diff --git a/.ci/generate_test_report.py b/.ci/generate_test_report.py index ff601a0cde106..6f2137e7803bb 100644 --- a/.ci/generate_test_report.py +++ b/.ci/generate_test_report.py @@ -19,12 +19,13 @@ def junit_from_xml(xml): class TestReports(unittest.TestCase): def test_title_only(self): - self.assertEqual(_generate_report("Foo", []), ("", "success")) + self.assertEqual(_generate_report("Foo", 0, []), ("", "success")) def test_no_tests_in_testsuite(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -45,6 +46,7 @@ def test_no_failures(self): self.assertEqual( _generate_report( "Foo", + 0, [ junit_from_xml( dedent( @@ -70,10 +72,51 @@ def test_no_failures(self): ), ) + def test_no_failures_build_failed(self): + self.assertEqual( + _generate_report( + "Foo", + 1, + [ + junit_from_xml( + dedent( + """\ + + + + + + """ + ) + ) + ], + buildkite_info={ + "BUILDKITE_ORGANIZATION_SLUG": "organization_slug", + "BUILDKITE_PIPELINE_SLUG": "pipeline_slug", + "BUILDKITE_BUILD_NUMBER": "build_number", + "BUILDKITE_JOB_ID": "job_id", + }, + ), + ( + dedent( + """\ + # Foo + + * 1 test passed + + All tests passed but another part of the build **failed**. + + [Download](https://buildkite.com/organizations/organization_slug/pipelines/pipeline_slug/builds/build_number/jobs/job_id/download.txt) the build's log file to see the details.""" + ), + "error", + ), + ) + def test_report_single_file_single_testsuite(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -166,6 +209,7 @@ def test_report_single_file_multiple_testsuites(self): self.assertEqual( _generate_report( "ABC and DEF", + 1, [ junit_from_xml( dedent( @@ -198,6 +242,7 @@ def test_report_multiple_files_multiple_testsuites(self): self.assertEqual( _generate_report( "ABC and DEF", + 1, [ junit_from_xml( dedent( @@ -238,6 +283,7 @@ def test_report_dont_list_failures(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -272,6 +318,7 @@ def test_report_dont_list_failures_link_to_log(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -312,6 +359,7 @@ def test_report_size_limit(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -351,12 +399,18 @@ def test_report_size_limit(self): # and output will not be. def _generate_report( title, + return_code, junit_objects, size_limit=1024 * 1024, list_failures=True, buildkite_info=None, ): if not junit_objects: + # Note that we do not post an empty report, therefore we can ignore a + # non-zero return code in situations like this. + # + # If we were going to post a report, then yes, it would be misleading + # to say we succeeded when the final return code was non-zero. return ("", "success") failures = {} @@ -385,7 +439,11 @@ def _generate_report( if not tests_run: return ("", None) - style = "error" if tests_failed else "success" + style = "success" + # Either tests failed, or all tests passed but something failed to build. 
+ if tests_failed or return_code != 0: + style = "error" + report = [f"# {title}", ""] tests_passed = tests_run - tests_skipped - tests_failed @@ -400,17 +458,17 @@ def plural(num_tests): if tests_failed: report.append(f"* {tests_failed} {plural(tests_failed)} failed") - if not list_failures: - if buildkite_info is not None: - log_url = ( - "https://buildkite.com/organizations/{BUILDKITE_ORGANIZATION_SLUG}/" - "pipelines/{BUILDKITE_PIPELINE_SLUG}/builds/{BUILDKITE_BUILD_NUMBER}/" - "jobs/{BUILDKITE_JOB_ID}/download.txt".format(**buildkite_info) - ) - download_text = f"[Download]({log_url})" - else: - download_text = "Download" + if buildkite_info is not None: + log_url = ( + "https://buildkite.com/organizations/{BUILDKITE_ORGANIZATION_SLUG}/" + "pipelines/{BUILDKITE_PIPELINE_SLUG}/builds/{BUILDKITE_BUILD_NUMBER}/" + "jobs/{BUILDKITE_JOB_ID}/download.txt".format(**buildkite_info) + ) + download_text = f"[Download]({log_url})" + else: + download_text = "Download" + if not list_failures: report.extend( [ "", @@ -435,11 +493,23 @@ def plural(num_tests): "", ] ) + elif return_code != 0: + # No tests failed but the build was in a failed state. Bring this to the user's + # attention. + report.extend( + [ + "", + "All tests passed but another part of the build **failed**.", + "", + f"{download_text} the build's log file to see the details.", + ] + ) report = "\n".join(report) if len(report.encode("utf-8")) > size_limit: return _generate_report( title, + return_code, junit_objects, size_limit, list_failures=False, @@ -449,9 +519,10 @@ def plural(num_tests): return report, style -def generate_report(title, junit_files, buildkite_info): +def generate_report(title, return_code, junit_files, buildkite_info): return _generate_report( title, + return_code, [JUnitXml.fromfile(p) for p in junit_files], buildkite_info=buildkite_info, ) @@ -463,6 +534,7 @@ def generate_report(title, junit_files, buildkite_info): "title", help="Title of the test report, without Markdown formatting." ) parser.add_argument("context", help="Annotation context to write to.") + parser.add_argument("return_code", help="The build's return code.", type=int) parser.add_argument("junit_files", help="Paths to JUnit report files.", nargs="*") args = parser.parse_args() @@ -477,7 +549,9 @@ def generate_report(title, junit_files, buildkite_info): if len(buildkite_info) != len(env_var_names): buildkite_info = None - report, style = generate_report(args.title, args.junit_files, buildkite_info) + report, style = generate_report( + args.title, args.return_code, args.junit_files, buildkite_info + ) if report: p = subprocess.Popen( diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 4bfebd5f75279..55741bc831046 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -29,6 +29,8 @@ if [[ -n "${CLEAR_CACHE:-}" ]]; then fi function at-exit { + retcode=$? + mkdir -p artifacts ccache --print-stats > artifacts/ccache_stats.txt @@ -37,7 +39,7 @@ function at-exit { if command -v buildkite-agent 2>&1 >/dev/null then python3 "${MONOREPO_ROOT}"/.ci/generate_test_report.py ":linux: Linux x64 Test Results" \ - "linux-x64-test-results" "${BUILD_DIR}"/test-results.*.xml + "linux-x64-test-results" $retcode "${BUILD_DIR}"/test-results.*.xml fi } trap at-exit EXIT diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 25cdd2f419f47..68303a3ea153a 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -28,6 +28,8 @@ fi sccache --zero-stats function at-exit { + retcode=$? 
+ mkdir -p artifacts sccache --show-stats >> artifacts/sccache_stats.txt @@ -36,7 +38,7 @@ function at-exit { if command -v buildkite-agent 2>&1 >/dev/null then python "${MONOREPO_ROOT}"/.ci/generate_test_report.py ":windows: Windows x64 Test Results" \ - "windows-x64-test-results" "${BUILD_DIR}"/test-results.*.xml + "windows-x64-test-results" $retcode "${BUILD_DIR}"/test-results.*.xml fi } trap at-exit EXIT diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index e020710c7aa4f..2eb0777dbdc6c 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2137,8 +2137,8 @@ method; it specifies that the method expects its ``self`` parameter to have a - (void) bar __attribute__((ns_consumes_self)); - (void) baz:(id) __attribute__((ns_consumed)) x; -Further examples of these attributes are available in the static analyzer's `list of annotations for analysis -`_. +Further examples of these attributes are available in the static analyzer's +`list of annotations for analysis `__. Query for these features with ``__has_attribute(ns_consumed)``, ``__has_attribute(ns_returns_retained)``, etc. @@ -4792,8 +4792,8 @@ Extensions for Static Analysis Clang supports additional attributes that are useful for documenting program invariants and rules for static analysis tools, such as the `Clang Static Analyzer `_. These attributes are documented -in the analyzer's `list of source-level annotations -`_. +in the analyzer's `list of annotations for analysis +`__. Extensions for Dynamic Analysis diff --git a/clang/docs/Modules.rst b/clang/docs/Modules.rst index 06294e3c58a4f..69a45b7fd9ace 100644 --- a/clang/docs/Modules.rst +++ b/clang/docs/Modules.rst @@ -152,7 +152,7 @@ first include path that would refer to the current file. ``#include_next`` is interpreted as if the current file had been found in that path. If this search finds a file named by a module map, the ``#include_next`` directive is translated into an import, just like for a ``#include`` -directive.`` +directive. Module maps ----------- diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index a1cb7fe359ebf..673c34bf08a4a 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -286,6 +286,8 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | 'allocator' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/114883 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | 'align' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/121814 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | new memory management routines | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | changes to omp_alloctrait_key enum | :none:`unclaimed` | | diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a14fb189c8e13..9eeb872aa57d7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -58,6 +58,29 @@ code bases. containing strict-aliasing violations. The new default behavior can be disabled using ``-fno-pointer-tbaa``. +- Clang will now more aggressively use undefined behavior on pointer addition + overflow for optimization purposes. For example, a check like + ``ptr + unsigned_offset < ptr`` will now optimize to ``false``, because + ``ptr + unsigned_offset`` will cause undefined behavior if it overflows (or + advances past the end of the object). + + Previously, ``ptr + unsigned_offset < ptr`` was optimized (by both Clang and + GCC) to ``(ssize_t)unsigned_offset < 0``. This also results in an incorrect + overflow check, but in a way that is less apparent when only testing with + pointers in the low half of the address space. + + To avoid pointer addition overflow, it is necessary to perform the addition + on integers, for example using + ``(uintptr_t)ptr + unsigned_offset < (uintptr_t)ptr``. Sometimes, it is also + possible to rewrite checks by only comparing the offset. For example, + ``ptr + offset < end_ptr && ptr + offset >= ptr`` can be written as + ``offset < (uintptr_t)(end_ptr - ptr)``. + + Undefined behavior due to pointer addition overflow can be reliably detected + using ``-fsanitize=pointer-overflow``. It is also possible to use + ``-fno-strict-overflow`` to opt in to a language dialect where signed integer + and pointer overflow are well-defined. + C/C++ Language Potentially Breaking Changes ------------------------------------------- @@ -1347,6 +1370,7 @@ OpenMP Support always build support for AMDGPU and NVPTX targets. - Added support for combined masked constructs 'omp parallel masked taskloop', 'omp parallel masked taskloop simd','omp masked taskloop' and 'omp masked taskloop simd' directive. +- Added support for the 'align' modifier in the 'allocate' clause (see the example below).
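+
+  A minimal sketch of the new modifier in use (hypothetical code, not part of
+  this patch; assumes the OpenMP 5.1 allocate-clause grammar and the
+  predefined ``omp_default_mem_alloc`` allocator):
+
+  .. code-block:: c
+
+     #include <omp.h>
+
+     void work(void) {
+       double buf[128];
+       // Each thread's private copy of buf is requested from
+       // omp_default_mem_alloc with at least 64-byte alignment.
+     #pragma omp parallel private(buf) \
+             allocate(allocator(omp_default_mem_alloc), align(64): buf)
+       {
+         buf[0] = 0.0;
+       }
+     }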
Improvements ^^^^^^^^^^^^ diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 4de288250f3ad..260e84910c6f7 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -1364,10 +1364,8 @@ Controlling Static Analyzer Diagnostics While not strictly part of the compiler, the diagnostics from Clang's `static analyzer `_ can also be influenced by the user via changes to the source code. See the available -`annotations `_ and the -analyzer's `FAQ -page `_ for more -information. +`annotations `_ and the analyzer's +`FAQ page `_ for more information. .. _usersmanual-precompiled-headers: diff --git a/clang/www/analyzer/images/example_attribute_nonnull.png b/clang/docs/analyzer/images/example_attribute_nonnull.png similarity index 100% rename from clang/www/analyzer/images/example_attribute_nonnull.png rename to clang/docs/analyzer/images/example_attribute_nonnull.png diff --git a/clang/www/analyzer/images/example_cf_returns_retained.png b/clang/docs/analyzer/images/example_cf_returns_retained.png similarity index 100% rename from clang/www/analyzer/images/example_cf_returns_retained.png rename to clang/docs/analyzer/images/example_cf_returns_retained.png diff --git a/clang/www/analyzer/images/example_custom_assert.png b/clang/docs/analyzer/images/example_custom_assert.png similarity index 100% rename from clang/www/analyzer/images/example_custom_assert.png rename to clang/docs/analyzer/images/example_custom_assert.png diff --git a/clang/www/analyzer/images/example_ns_returns_retained.png b/clang/docs/analyzer/images/example_ns_returns_retained.png similarity index 100% rename from clang/www/analyzer/images/example_ns_returns_retained.png rename to clang/docs/analyzer/images/example_ns_returns_retained.png diff --git a/clang/www/analyzer/images/example_null_pointer.png b/clang/docs/analyzer/images/example_null_pointer.png similarity index 100% rename from clang/www/analyzer/images/example_null_pointer.png rename to clang/docs/analyzer/images/example_null_pointer.png diff --git a/clang/www/analyzer/images/example_use_assert.png b/clang/docs/analyzer/images/example_use_assert.png similarity index 100% rename from clang/www/analyzer/images/example_use_assert.png rename to clang/docs/analyzer/images/example_use_assert.png diff --git a/clang/docs/analyzer/user-docs.rst b/clang/docs/analyzer/user-docs.rst index dd53ae143148c..e265f033a2c54 100644 --- a/clang/docs/analyzer/user-docs.rst +++ b/clang/docs/analyzer/user-docs.rst @@ -12,4 +12,5 @@ Contents: user-docs/FilingBugs user-docs/CrossTranslationUnit user-docs/TaintAnalysisConfiguration + user-docs/Annotations user-docs/FAQ diff --git a/clang/docs/analyzer/user-docs/Annotations.rst b/clang/docs/analyzer/user-docs/Annotations.rst new file mode 100644 index 0000000000000..d87e8f4df99c3 --- /dev/null +++ b/clang/docs/analyzer/user-docs/Annotations.rst @@ -0,0 +1,689 @@ +================== +Source Annotations +================== + +The Clang frontend supports several source-level annotations in the form of +`GCC-style attributes `_ +and pragmas that can help make the Clang Static Analyzer more useful. +These annotations can both suppress false positives and enhance the +analyzer's ability to find bugs. + +This page gives a practical overview of such annotations. For more technical +specifics regarding Clang-specific annotations, please see Clang's list of +`language extensions `_. 
+Details of "standard" GCC attributes (that Clang also supports) can +be found in the `GCC manual `_, with the +majority of the relevant attributes being in the section on +`function attributes `_. + +Note that attributes that are labeled **Clang-specific** are not +recognized by GCC. Their use can be conditioned using preprocessor macros +(examples included on this page). + +.. contents:: + :local: + +Annotations to Enhance Generic Checks +_____________________________________ + +Null Pointer Checking +##################### + +Attribute 'nonnull' +------------------- + +The analyzer recognizes the GCC attribute 'nonnull', which indicates that a +function expects that a given function parameter is not a null pointer. +Specific details of the syntax of using the 'nonnull' attribute can be found in +`GCC's documentation `_. + +Both the Clang compiler and GCC will flag warnings for simple cases where a +null pointer is directly being passed to a function with a 'nonnull' parameter +(e.g., as a constant). The analyzer extends this checking by using its deeper +symbolic analysis to track what pointer values are potentially null and then +flag warnings when they are passed in a function call via a 'nonnull' +parameter. + +**Example** + +.. code-block:: c + + int bar(int*p, int q, int *r) __attribute__((nonnull(1,3))); + + int foo(int *p, int *q) { + return !p ? bar(q, 2, p) + : bar(p, 2, q); + } + +Running ``scan-build`` over this source produces the following output: + +.. image:: ../images/example_attribute_nonnull.png + +.. _custom_assertion_handlers: + +Custom Assertion Handlers +######################### + +The analyzer exploits code assertions by pruning off paths where the +assertion condition is false. The idea is capture any program invariants +specified in the assertion that the developer may know but is not immediately +apparent in the code itself. In this way assertions make implicit assumptions +explicit in the code, which not only makes the analyzer more accurate when +finding bugs, but can help others better able to understand your code as well. +It can also help remove certain kinds of analyzer false positives by pruning off +false paths. + +In order to exploit assertions, however, the analyzer must understand when it +encounters an "assertion handler". Typically assertions are +implemented with a macro, with the macro performing a check for the assertion +condition and, when the check fails, calling an assertion handler. For +example, consider the following code fragment: + +.. code-block: c + + void foo(int *p) { + assert(p != NULL); + } + +When this code is preprocessed on Mac OS X it expands to the following: + +.. code-block: c + + void foo(int *p) { + (__builtin_expect(!(p != NULL), 0) ? __assert_rtn(__func__, "t.c", 4, "p != NULL") : (void)0); + } + +In this example, the assertion handler is ``__assert_rtn``. When called, +most assertion handlers typically print an error and terminate the program. The +analyzer can exploit such semantics by ending the analysis of a path once it +hits a call to an assertion handler. + +The trick, however, is that the analyzer needs to know that a called function +is an assertion handler; otherwise the analyzer might assume the function call +returns and it will continue analyzing the path where the assertion condition +failed. 
This can lead to false positives, as the assertion condition usually +implies a safety condition (e.g., a pointer is not null) prior to performing +some action that depends on that condition (e.g., dereferencing a pointer). + +The analyzer knows about several well-known assertion handlers, but can +automatically infer if a function should be treated as an assertion handler if +it is annotated with the 'noreturn' attribute or the (Clang-specific) +'analyzer_noreturn' attribute. Note that, currently, Clang does not support +these attributes on Objective-C methods and C++ methods. + +Attribute 'noreturn' +-------------------- + +The 'noreturn' attribute is a GCC attribute that can be placed on the +declarations of functions. It means exactly what its name implies: a function +with a 'noreturn' attribute should never return. + +Specific details of the syntax of using the 'noreturn' attribute can be found +in `GCC's documentation `__. + +Not only does the analyzer exploit this information when pruning false paths, +but the compiler also takes it seriously and will generate different (and +possibly better optimized) code under the assumption that the function does not +return. + +**Example** + +On Mac OS X, the function prototype for ``__assert_rtn`` (declared in +``assert.h``) is specifically annotated with the 'noreturn' attribute: + +.. code-block:: c + + void __assert_rtn(const char *, const char *, int, const char *) __attribute__((__noreturn__)); + +Attribute 'analyzer_noreturn' (Clang-specific) +---------------------------------------------- + +The Clang-specific 'analyzer_noreturn' attribute is almost identical to +'noreturn' except that it is ignored by the compiler for the purposes of code +generation. + +This attribute is useful for annotating assertion handlers that actually +*can* return, but for the purpose of using the analyzer we want to +pretend that such functions do not return. + +Because this attribute is Clang-specific, its use should be conditioned with +the use of preprocessor macros. + +**Example** + +.. code-block:: c + + #ifndef CLANG_ANALYZER_NORETURN + #if __has_feature(attribute_analyzer_noreturn) + #define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn)) + #else + #define CLANG_ANALYZER_NORETURN + #endif + #endif + + void my_assert_rtn(const char *, const char *, int, const char *) CLANG_ANALYZER_NORETURN; + +Mac OS X API Annotations +________________________ + +.. _cocoa_mem: + +Cocoa & Core Foundation Memory Management Annotations +##################################################### + +The analyzer supports the proper management of retain counts for +both Cocoa and Core Foundation objects. This checking is largely based on +enforcing Cocoa and Core Foundation naming conventions for Objective-C methods +(Cocoa) and C functions (Core Foundation). Not strictly following these +conventions can cause the analyzer to miss bugs or flag false positives. + +One can educate the analyzer (and others who read your code) about methods or +functions that deviate from the Cocoa and Core Foundation conventions using the +attributes described here. However, you should consider using proper naming +conventions or the `objc_method_family `_ +attribute, if applicable. + +.. 
_ns_returns_retained: + +Attribute 'ns_returns_retained' (Clang-specific) +------------------------------------------------ + +The GCC-style (Clang-specific) attribute 'ns_returns_retained' allows one to +annotate an Objective-C method or C function as returning a retained Cocoa +object that the caller is responsible for releasing (via sending a +``release`` message to the object). The Foundation framework defines a +macro ``NS_RETURNS_RETAINED`` that is functionally equivalent to the +one shown below. + +**Placing on Objective-C methods**: For Objective-C methods, this +annotation essentially tells the analyzer to treat the method as if its name +begins with "alloc" or "new" or contains the word +"copy". + +**Placing on C functions**: For C functions returning Cocoa objects, the +analyzer typically does not make any assumptions about whether or not the object +is returned retained. Explicitly adding the 'ns_returns_retained' attribute to C +functions allows the analyzer to perform extra checking. + +**Example** + +.. code-block:: objc + + #import <Foundation/Foundation.h> + + #ifndef __has_feature // Optional. + #define __has_feature(x) 0 // Compatibility with non-clang compilers. + #endif + + #ifndef NS_RETURNS_RETAINED + #if __has_feature(attribute_ns_returns_retained) + #define NS_RETURNS_RETAINED __attribute__((ns_returns_retained)) + #else + #define NS_RETURNS_RETAINED + #endif + #endif + + @interface MyClass : NSObject {} + - (NSString*) returnsRetained NS_RETURNS_RETAINED; + - (NSString*) alsoReturnsRetained; + @end + + @implementation MyClass + - (NSString*) returnsRetained { + return [[NSString alloc] initWithCString:"no leak here"]; + } + - (NSString*) alsoReturnsRetained { + return [[NSString alloc] initWithCString:"flag a leak"]; + } + @end + +Running ``scan-build`` on this source file produces the following output: + +.. image:: ../images/example_ns_returns_retained.png + +.. _ns_returns_not_retained: + +Attribute 'ns_returns_not_retained' (Clang-specific) +---------------------------------------------------- + +The 'ns_returns_not_retained' attribute is the complement of +'`ns_returns_retained`_'. Where a function or method may appear to obey the +Cocoa conventions and return a retained Cocoa object, this attribute can be +used to indicate that the object reference returned should not be considered +an "owning" reference being returned to the caller. The Foundation +framework defines a macro ``NS_RETURNS_NOT_RETAINED`` that is functionally +equivalent to the one shown below. + +Usage is identical to `ns_returns_retained`_. When using the +attribute, be sure to declare it within the proper macro that checks for +its availability, as it is not available in earlier versions of the analyzer: + +.. code-block:: objc + + #ifndef __has_feature // Optional. + #define __has_feature(x) 0 // Compatibility with non-clang compilers. + #endif + + #ifndef NS_RETURNS_NOT_RETAINED + #if __has_feature(attribute_ns_returns_not_retained) + #define NS_RETURNS_NOT_RETAINED __attribute__((ns_returns_not_retained)) + #else + #define NS_RETURNS_NOT_RETAINED + #endif + #endif + +.. _cf_returns_retained: + +Attribute 'cf_returns_retained' (Clang-specific) +------------------------------------------------ + +The GCC-style (Clang-specific) attribute 'cf_returns_retained' allows one to +annotate an Objective-C method or C function as returning a retained Core +Foundation object that the caller is responsible for releasing. 
The +CoreFoundation framework defines a macro ``CF_RETURNS_RETAINED`` that is +functionally equivalent to the one shown below. + +**Placing on Objective-C methods**: With respect to Objective-C methods, +this attribute is identical in its behavior and usage to 'ns_returns_retained' +except for the distinction of returning a Core Foundation object instead of a +Cocoa object. + +This distinction is important for the following reason: as Core Foundation is a +C API, the analyzer cannot always tell that a pointer return value refers to a +Core Foundation object. In contrast, it is trivial for the analyzer to +recognize if a pointer refers to a Cocoa object (given the Objective-C type +system). + +**Placing on C functions**: When placing the attribute +'cf_returns_retained' on the declarations of C functions, the analyzer +interprets the function as: + +1. Returning a Core Foundation Object +2. Treating the function as if its name contained the keywords + "create" or "copy". This means the returned object has a + +1 retain count that must be released by the caller, either by sending a + ``release`` message (via toll-free bridging to an Objective-C object + pointer), or calling ``CFRelease`` or a similar function. + +**Example** + +.. code-block:: objc + + #import <Cocoa/Cocoa.h> + + #ifndef __has_feature // Optional. + #define __has_feature(x) 0 // Compatibility with non-clang compilers. + #endif + + #ifndef CF_RETURNS_RETAINED + #if __has_feature(attribute_cf_returns_retained) + #define CF_RETURNS_RETAINED __attribute__((cf_returns_retained)) + #else + #define CF_RETURNS_RETAINED + #endif + #endif + + @interface MyClass : NSObject {} + - (NSDate*) returnsCFRetained CF_RETURNS_RETAINED; + - (NSDate*) alsoReturnsRetained; + - (NSDate*) returnsNSRetained NS_RETURNS_RETAINED; + @end + + CF_RETURNS_RETAINED + CFDateRef returnsRetainedCFDate() { + return CFDateCreate(0, CFAbsoluteTimeGetCurrent()); + } + + @implementation MyClass + - (NSDate*) returnsCFRetained { + return (NSDate*) returnsRetainedCFDate(); // No leak. + } + + - (NSDate*) alsoReturnsRetained { + return (NSDate*) returnsRetainedCFDate(); // Always report a leak. + } + + - (NSDate*) returnsNSRetained { + return (NSDate*) returnsRetainedCFDate(); // Report a leak when using GC. + } + @end + +Running ``scan-build`` on this example produces the following output: + +.. image:: ../images/example_cf_returns_retained.png + +Attribute 'cf_returns_not_retained' (Clang-specific) +---------------------------------------------------- + +The 'cf_returns_not_retained' attribute is the complement of +'`cf_returns_retained`_'. Where a function or method may appear to obey the +Core Foundation or Cocoa conventions and return a retained Core Foundation +object, this attribute can be used to indicate that the object reference +returned should not be considered an "owning" reference being +returned to the caller. The CoreFoundation framework defines a macro +``CF_RETURNS_NOT_RETAINED`` that is functionally equivalent to the one +shown below. + +Usage is identical to cf_returns_retained_. When using the attribute, be sure +to declare it within the proper macro that checks for its availability, as it +is not available in earlier versions of the analyzer: + +.. code-block:: objc + + #ifndef __has_feature // Optional. + #define __has_feature(x) 0 // Compatibility with non-clang compilers. 
+ #endif + + #ifndef CF_RETURNS_NOT_RETAINED + #if __has_feature(attribute_cf_returns_not_retained) + #define CF_RETURNS_NOT_RETAINED __attribute__((cf_returns_not_retained)) + #else + #define CF_RETURNS_NOT_RETAINED + #endif + #endif + +.. _ns_consumed: + +Attribute 'ns_consumed' (Clang-specific) +---------------------------------------- + +The 'ns_consumed' attribute can be placed on a specific parameter in either +the declaration of a function or an Objective-C method. It indicates to the +static analyzer that a ``release`` message is implicitly sent to the +parameter upon completion of the call to the given function or method. The +Foundation framework defines a macro ``NS_RELEASES_ARGUMENT`` that +is functionally equivalent to the ``NS_CONSUMED`` macro shown below. + +**Example** + +.. code-block:: objc + + #ifndef __has_feature // Optional. + #define __has_feature(x) 0 // Compatibility with non-clang compilers. + #endif + + #ifndef NS_CONSUMED + #if __has_feature(attribute_ns_consumed) + #define NS_CONSUMED __attribute__((ns_consumed)) + #else + #define NS_CONSUMED + #endif + #endif + + void consume_ns(id NS_CONSUMED x); + + void test() { + id x = [[NSObject alloc] init]; + consume_ns(x); // No leak! + } + + @interface Foo : NSObject + + (void) releaseArg:(id) NS_CONSUMED x; + + (void) releaseSecondArg:(id)x second:(id) NS_CONSUMED y; + @end + + void test_method() { + id x = [[NSObject alloc] init]; + [Foo releaseArg:x]; // No leak! + } + + void test_method2() { + id a = [[NSObject alloc] init]; + id b = [[NSObject alloc] init]; + [Foo releaseSecondArg:a second:b]; // 'a' is leaked, but 'b' is released. + } + +Attribute 'cf_consumed' (Clang-specific) +---------------------------------------- + +The 'cf_consumed' attribute is practically identical to ns_consumed_. The +attribute can be placed on a specific parameter in either the declaration of a +function or an Objective-C method. It indicates to the static analyzer that the +object reference is implicitly passed to a call to ``CFRelease`` upon +completion of the call to the given function or method. The CoreFoundation +framework defines a macro ``CF_RELEASES_ARGUMENT`` that is functionally +equivalent to the ``CF_CONSUMED`` macro shown below. + +Operationally this attribute is nearly identical to 'ns_consumed'. + +**Example** + +.. code-block:: objc + + #ifndef __has_feature // Optional. + #define __has_feature(x) 0 // Compatibility with non-clang compilers. + #endif + + #ifndef CF_CONSUMED + #if __has_feature(attribute_cf_consumed) + #define CF_CONSUMED __attribute__((cf_consumed)) + #else + #define CF_CONSUMED + #endif + #endif + + void consume_cf(id CF_CONSUMED x); + void consume_CFDate(CFDateRef CF_CONSUMED x); + + void test() { + id x = [[NSObject alloc] init]; + consume_cf(x); // No leak! + } + + void test2() { + CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent()); + consume_CFDate(date); // No leak, including under GC! + } + + @interface Foo : NSObject + + (void) releaseArg:(CFDateRef) CF_CONSUMED x; + @end + + void test_method() { + CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent()); + [Foo releaseArg:date]; // No leak! + } + +.. _ns_consumes_self: + +Attribute 'ns_consumes_self' (Clang-specific) +--------------------------------------------- + +The 'ns_consumes_self' attribute can be placed only on an Objective-C method +declaration. It indicates that the receiver of the message is +"consumed" (a single reference count decremented) after the message +is sent. 
This matches the semantics of all "init" methods. + +One use of this attribute is to declare your own init-like methods that do not +follow the standard Cocoa naming conventions. + +**Example** + +.. code-block:: objc + + #ifndef __has_feature + #define __has_feature(x) 0 // Compatibility with non-clang compilers. + #endif + + #ifndef NS_CONSUMES_SELF + #if __has_feature(attribute_ns_consumes_self) + #define NS_CONSUMES_SELF __attribute__((ns_consumes_self)) + #else + #define NS_CONSUMES_SELF + #endif + #endif + + @interface MyClass : NSObject + - initWith:(MyClass *)x; + - nonstandardInitWith:(MyClass *)x NS_CONSUMES_SELF NS_RETURNS_RETAINED; + @end + +In this example, ``-nonstandardInitWith:`` has the same ownership +semantics as the init method ``-initWith:``. The static analyzer will +observe that the method consumes the receiver, and then returns an object with +a +1 retain count. + +The Foundation framework defines a macro ``NS_REPLACES_RECEIVER`` which is +functionally equivalent to the combination of ``NS_CONSUMES_SELF`` and +``NS_RETURNS_RETAINED`` shown above. + +Libkern Memory Management Annotations +##################################### + +`Libkern `_ +requires developers to inherit all heap allocated objects from ``OSObject`` and +to perform manual reference counting. The reference counting model is very +similar to MRR (manual retain-release) mode in +`Objective-C `_ +or to CoreFoundation reference counting. +Freshly-allocated objects start with a reference count of 1, and calls to +``retain`` increment it, while calls to ``release`` decrement it. The object is +deallocated whenever its reference count reaches zero. + +Manually incrementing and decrementing reference counts is error-prone: +over-retains lead to leaks, and over-releases lead to uses-after-free. +The analyzer can help the programmer check for unbalanced +retain/release calls. + +The reference count checking is based on the principle of *locality*: it should +be possible to establish correctness (lack of leaks/uses after free) by looking +at each function body, and the declarations (not the definitions) of all the +functions it interacts with. + +In order to support such reasoning, it should be possible to *summarize* the +behavior of each function, with respect to the reference counts of its returned +values and arguments. + +By default, the following summaries are assumed: + +- All functions starting with ``get`` or ``Get``, unless they are returning + subclasses of ``OSIterator``, are assumed to be returning at +0. That is, the + caller has no reference count *obligations* with respect to the reference + count of the returned object and should leave it untouched. + +- All other functions are assumed to return at +1. That is, the caller has an + *obligation* to release such objects. + +- Functions are assumed not to change the reference count of their parameters, + including the implicit ``this`` parameter. + +These summaries can be overridden with the following +`attributes `_: + +Attribute 'os_returns_retained' +------------------------------- + +The ``os_returns_retained`` attribute (accessed through the macro +``LIBKERN_RETURNS_RETAINED``) plays a role identical to `ns_returns_retained`_ +for functions returning ``OSObject`` subclasses. The attribute indicates that +it is a caller's responsibility to release the returned object. 
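+
+For example (a brief sketch; ``createFoo`` is a hypothetical factory function,
+while the macro comes from the libkern headers):
+
+.. code-block:: objc
+
+   // Returned at +1: the caller must eventually call release() on the result.
+   LIBKERN_RETURNS_RETAINED OSObject *createFoo();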
+ +Attribute 'os_returns_not_retained' +----------------------------------- + +The ``os_returns_not_retained`` attribute (accessed through the macro +``LIBKERN_RETURNS_NOT_RETAINED``) plays a role identical to +`ns_returns_not_retained`_ for functions returning ``OSObject`` subclasses. The +attribute indicates that the caller should not change the retain count of the +returned object. + + +**Example** + +.. code-block:: objc + + class MyClass { + OSObject *f; + LIBKERN_RETURNS_NOT_RETAINED OSObject *myFieldGetter(); + }; + + // Note that the annotation only has to be applied to the function declaration. + OSObject *MyClass::myFieldGetter() { + return f; + } + +Attribute 'os_consumed' +----------------------- + +Similarly to the `ns_consumed`_ attribute, the ``os_consumed`` attribute +(accessed through ``LIBKERN_CONSUMED``), applied to a parameter, indicates that +the call to the function *consumes* the parameter: the callee should either +release it or store it and release it in the destructor, while the caller +should assume one is subtracted from the reference count after the call. + +.. code-block:: objc + + IOReturn addToList(LIBKERN_CONSUMED IOPMinformee *newInformee); + +Attribute 'os_consumes_this' +---------------------------- + +Similarly to `ns_consumes_self`_, the ``os_consumes_this`` attribute indicates +that the method call *consumes* the implicit ``this`` argument: the caller +should assume one was subtracted from the reference count of the object after +the call, and the callee has an obligation to either release the argument, or +store it and eventually release it in the destructor. + + +.. code-block:: objc + + void addThisToList(OSArray *givenList) LIBKERN_CONSUMES_THIS; + +Out Parameters +-------------- + +A function can also return an object to a caller by means of an out parameter +(a pointer-to-OSObject-pointer is passed, and a callee writes a pointer to an +object into an argument). Currently the analyzer does not track unannotated out +parameters by default, but with annotations we distinguish four separate cases: + +**1. Non-retained out parameters**, identified using +``LIBKERN_RETURNS_NOT_RETAINED`` applied to parameters, e.g.: + +.. code-block:: objc + + void getterViaOutParam(LIBKERN_RETURNS_NOT_RETAINED OSObject **obj) + +Such functions write a non-retained object into an out parameter, and the +caller has no further obligations. + +**2. Retained out parameters**, identified using ``LIBKERN_RETURNS_RETAINED``: + +.. code-block:: objc + + void getterViaOutParam(LIBKERN_RETURNS_RETAINED OSObject **obj) + +In such cases a retained object is written into an out parameter, which the +caller then has to release in order to avoid a leak. + +These two cases are simple, but in practice a function returning an +out-parameter usually also returns a return code, and then the out parameter +may or may not be written, depending on the exit code, e.g.: + +.. code-block:: objc + + bool maybeCreateObject(LIBKERN_RETURNS_RETAINED OSObject **obj); + +For such functions, the usual semantics is that an object is written into on "success", and not written into on "failure". + +For ``LIBKERN_RETURNS_RETAINED`` we assume the following definition of +success: + +- For functions returning ``OSReturn`` or ``IOReturn`` (any typedef to + ``kern_return_t``) success is defined as having an output of zero + (``kIOReturnSuccess`` is zero). + +- For all others, success is non-zero (e.g. non-nullptr for pointers). + +**3. 
Retained out parameters on zero return** The annotation +``LIBKERN_RETURNS_RETAINED_ON_ZERO`` states that a retained object is written +into the parameter if and only if the function returns a zero value: + +.. code-block:: objc + + bool OSUnserializeXML(void *data, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errString); + +The caller then has to release the object if the function has returned zero. + +**4. Retained out parameters on non-zero return** Similarly, +``LIBKERN_RETURNS_RETAINED_ON_NONZERO`` specifies that a retained object is +written into the parameter if and only if the function has returned a non-zero +value. + +Note that for non-retained out parameters conditionals do not matter, as the +caller has no obligations regardless of whether an object is written into or +not. diff --git a/clang/docs/analyzer/user-docs/FAQ.rst b/clang/docs/analyzer/user-docs/FAQ.rst index af52e99c91d68..58eac783efccd 100644 --- a/clang/docs/analyzer/user-docs/FAQ.rst +++ b/clang/docs/analyzer/user-docs/FAQ.rst @@ -9,7 +9,9 @@ Custom Assertions Q: How do I tell the analyzer that I do not want the bug being reported here since my custom error handler will safely end the execution before the bug is reached? -You can tell the analyzer that this path is unreachable by teaching it about your `custom assertion handlers `_. For example, you can modify the code segment as following: +.. image:: ../images/example_custom_assert.png + +You can tell the analyzer that this path is unreachable by teaching it about your `custom assertion handlers `__. For example, you can modify the code segment as following: .. code-block:: c @@ -25,6 +27,8 @@ Null Pointer Dereference Q: The analyzer reports a null dereference, but I know that the pointer is never null. How can I tell the analyzer that a pointer can never be null? +.. image:: ../images/example_null_pointer.png + The reason the analyzer often thinks that a pointer can be null is because the preceding code checked compared it against null. If you are absolutely sure that it cannot be null, remove the preceding check and, preferably, add an assertion as well. For example: .. code-block:: c @@ -143,6 +147,8 @@ Ensuring Loop Body Execution Q: The analyzer assumes that a loop body is never entered. How can I tell it that the loop body will be entered at least once? +.. image:: ../images/example_use_assert.png + In cases where you know that a loop will always be entered at least once, you can use assertions to inform the analyzer. For example: .. code-block:: c @@ -162,7 +168,7 @@ Suppressing Specific Warnings Q: How can I suppress a specific analyzer warning? -When you encounter an analyzer bug/false positive, check if it's one of the issues discussed above or if the analyzer `annotations `_ can resolve the issue by helping the static analyzer understand the code better. Second, please `report it `_ to help us improve user experience. +When you encounter an analyzer bug/false positive, check if it's one of the issues discussed above or if the analyzer `annotations `__ can resolve the issue by helping the static analyzer understand the code better. Second, please `report it `_ to help us improve user experience. Sometimes there's really no "good" way to eliminate the issue. In such cases you can "silence" it directly by annotating the problematic line of code with the help of Clang attribute 'suppress': @@ -192,6 +198,8 @@ Sometimes there's really no "good" way to eliminate the issue. In such cases you return *result; // as well as this leak path } +.. 
_exclude_code: + Excluding Code from Analysis ---------------------------- diff --git a/clang/examples/Attribute/Attribute.cpp b/clang/examples/Attribute/Attribute.cpp index 3b90724ad2220..625f1645afbff 100644 --- a/clang/examples/Attribute/Attribute.cpp +++ b/clang/examples/Attribute/Attribute.cpp @@ -42,8 +42,8 @@ struct ExampleAttrInfo : public ParsedAttrInfo { const Decl *D) const override { // This attribute appertains to functions only. if (!isa(D)) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "functions"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() << ExpectedFunction; return false; } return true; @@ -99,8 +99,9 @@ struct ExampleAttrInfo : public ParsedAttrInfo { const Stmt *St) const override { // This attribute appertains to for loop statements only. if (!isa(St)) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "for loop statements"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() + << ExpectedForLoopStatement; return false; } return true; diff --git a/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp b/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp index 12d4c311586e6..f206a84ab1311 100644 --- a/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp +++ b/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp @@ -168,8 +168,9 @@ struct CallSuperAttrInfo : public ParsedAttrInfo { const Decl *D) const override { const auto *TheMethod = dyn_cast_or_null(D); if (!TheMethod || !TheMethod->isVirtual()) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "virtual functions"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() + << ExpectedVirtualFunction; return false; } MarkedMethods.insert(TheMethod); diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 82932e098c86f..77abd8b657a61 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1334,7 +1334,7 @@ class DeclListNode { reference operator*() const { assert(Ptr && "dereferencing end() iterator"); - if (DeclListNode *CurNode = Ptr.dyn_cast()) + if (DeclListNode *CurNode = dyn_cast(Ptr)) return CurNode->D; return cast(Ptr); } @@ -1344,7 +1344,7 @@ class DeclListNode { inline iterator &operator++() { // ++It assert(!Ptr.isNull() && "Advancing empty iterator"); - if (DeclListNode *CurNode = Ptr.dyn_cast()) + if (DeclListNode *CurNode = dyn_cast(Ptr)) Ptr = CurNode->Rest; else Ptr = nullptr; diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index d2f5267e4da5e..b9088eff3bb52 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -498,6 +498,9 @@ class OMPAllocateClause final /// Allocator specified in the clause, or 'nullptr' if the default one is /// used. Expr *Allocator = nullptr; + /// Alignment specified in the clause, or 'nullptr' if the default one is + /// used. + Expr *Alignment = nullptr; /// Position of the ':' delimiter in the clause; SourceLocation ColonLoc; /// Modifier of 'allocate' clause. @@ -505,6 +508,41 @@ class OMPAllocateClause final /// Location of allocator modifier if any. 
SourceLocation AllocatorModifierLoc; + // ---------------------------------------------------------------------------- + + /// Modifiers for 'allocate' clause. + enum { FIRST, SECOND, NUM_MODIFIERS }; + OpenMPAllocateClauseModifier Modifiers[NUM_MODIFIERS]; + + /// Locations of modifiers. + SourceLocation ModifiersLoc[NUM_MODIFIERS]; + + /// Set the first allocate modifier. + /// + /// \param M Allocate modifier. + void setFirstAllocateModifier(OpenMPAllocateClauseModifier M) { + Modifiers[FIRST] = M; + } + + /// Set the second allocate modifier. + /// + /// \param M Allocate modifier. + void setSecondAllocateModifier(OpenMPAllocateClauseModifier M) { + Modifiers[SECOND] = M; + } + + /// Set location of the first allocate modifier. + void setFirstAllocateModifierLoc(SourceLocation Loc) { + ModifiersLoc[FIRST] = Loc; + } + + /// Set location of the second allocate modifier. + void setSecondAllocateModifierLoc(SourceLocation Loc) { + ModifiersLoc[SECOND] = Loc; + } + + // ---------------------------------------------------------------------------- + /// Build clause with number of variables \a N. /// /// \param StartLoc Starting location of the clause. @@ -514,15 +552,20 @@ class OMPAllocateClause final /// \param EndLoc Ending location of the clause. /// \param N Number of the variables in the clause. OMPAllocateClause(SourceLocation StartLoc, SourceLocation LParenLoc, - Expr *Allocator, SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, SourceLocation EndLoc, + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, + SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, + SourceLocation Modifier2Loc, SourceLocation EndLoc, unsigned N) : OMPVarListClause(llvm::omp::OMPC_allocate, StartLoc, LParenLoc, EndLoc, N), - Allocator(Allocator), ColonLoc(ColonLoc), - AllocatorModifier(AllocatorModifier), - AllocatorModifierLoc(AllocatorModifierLoc) {} + Allocator(Allocator), Alignment(Alignment), ColonLoc(ColonLoc) { + Modifiers[FIRST] = Modifier1; + Modifiers[SECOND] = Modifier2; + ModifiersLoc[FIRST] = Modifier1Loc; + ModifiersLoc[SECOND] = Modifier2Loc; + } /// Build an empty clause. /// @@ -530,7 +573,10 @@ class OMPAllocateClause final explicit OMPAllocateClause(unsigned N) : OMPVarListClause(llvm::omp::OMPC_allocate, SourceLocation(), SourceLocation(), - SourceLocation(), N) {} + SourceLocation(), N) { + Modifiers[FIRST] = OMPC_ALLOCATE_unknown; + Modifiers[SECOND] = OMPC_ALLOCATE_unknown; + } /// Sets location of ':' symbol in clause. void setColonLoc(SourceLocation CL) { ColonLoc = CL; } @@ -539,6 +585,7 @@ class OMPAllocateClause final void setAllocatorModifier(OpenMPAllocateClauseModifier AM) { AllocatorModifier = AM; } + void setAlignment(Expr *A) { Alignment = A; } public: /// Creates clause with a list of variables \a VL. @@ -554,19 +601,42 @@ class OMPAllocateClause final /// \param VL List of references to the variables. 
static OMPAllocateClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, - Expr *Allocator, SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, SourceLocation EndLoc, - ArrayRef VL); + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, SourceLocation Modifier2Loc, + SourceLocation EndLoc, ArrayRef VL); /// Returns the allocator expression or nullptr, if no allocator is specified. Expr *getAllocator() const { return Allocator; } + /// Returns the alignment expression or nullptr, if no alignment specified. + Expr *getAlignment() const { return Alignment; } + /// Return 'allocate' modifier. OpenMPAllocateClauseModifier getAllocatorModifier() const { return AllocatorModifier; } + /// Get the first modifier of the clause. + OpenMPAllocateClauseModifier getFirstAllocateModifier() const { + return Modifiers[FIRST]; + } + + /// Get location of first modifier of the clause. + SourceLocation getFirstAllocateModifierLoc() const { + return ModifiersLoc[FIRST]; + } + + /// Get the second modifier of the clause. + OpenMPAllocateClauseModifier getSecondAllocateModifier() const { + return Modifiers[SECOND]; + } + + /// Get location of second modifier of the clause. + SourceLocation getSecondAllocateModifierLoc() const { + return ModifiersLoc[SECOND]; + } + /// Returns the location of the ':' delimiter. SourceLocation getColonLoc() const { return ColonLoc; } /// Return the location of the modifier. diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c0632aaa51625..a752d94b06fad 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4353,6 +4353,16 @@ def HLSLLoopHint: StmtAttr { let Documentation = [HLSLLoopHintDocs, HLSLUnrollHintDocs]; } +def HLSLControlFlowHint: StmtAttr { + /// [branch] + /// [flatten] + let Spellings = [Microsoft<"branch">, Microsoft<"flatten">]; + let Subjects = SubjectList<[IfStmt], + ErrorDiag, "'if' statements">; + let LangOpts = [HLSL]; + let Documentation = [InternalOnly]; +} + def CapturedRecord : InheritableAttr { // This attribute has no spellings as it is only ever created implicitly. let Spellings = []; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 953ff9a700e51..e10f24e239ece 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -1461,7 +1461,7 @@ Mind that many more checkers are affected by dynamic memory modeling changes to some extent. Further reading for other annotations: -`Source Annotations in the Clang Static Analyzer `_. +`Source Annotations in the Clang Static Analyzer `_. 
}]; } diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 86fcae209c40d..3309f59a981fc 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1658,6 +1658,8 @@ def warn_omp_depend_in_ordered_deprecated : Warning<"'depend' clause for" def warn_omp_invalid_attribute_for_ompx_attributes : Warning<"'ompx_attribute' clause only allows " "'amdgpu_flat_work_group_size', 'amdgpu_waves_per_eu', and 'launch_bounds'; " "%0 is ignored">, InGroup; +def err_omp_duplicate_modifier : Error<"duplicate modifier '%0' in '%1' clause">; +def err_omp_expected_modifier : Error<"expected modifier in '%0' clause">; // Pragma loop support. def err_pragma_loop_missing_argument : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f04381a32a415..8be4f946dce1c 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3799,7 +3799,14 @@ def warn_attribute_wrong_decl_type : Warning< "|types and namespaces" "|variables, functions and classes" "|kernel functions" - "|non-K&R-style functions}2">, + "|non-K&R-style functions" + "|for loop statements" + "|virtual functions" + "|parameters and implicit object parameters" + "|non-member functions" + "|functions, classes, or enumerations" + "|classes" + "|typedefs}2">, InGroup; def err_attribute_wrong_decl_type : Error; def warn_type_attribute_wrong_type : Warning< diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index 3f25e7aafe23b..76a861f416fd5 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -219,6 +219,7 @@ OPENMP_NUMTASKS_MODIFIER(strict) // Modifiers for 'allocate' clause. OPENMP_ALLOCATE_MODIFIER(allocator) +OPENMP_ALLOCATE_MODIFIER(align) // Modifiers for the 'doacross' clause. OPENMP_DOACROSS_MODIFIER(source) diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 900ad6ca6d66f..3e5da2a6abc01 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -230,6 +230,10 @@ enum OpenMPAllocateClauseModifier { OMPC_ALLOCATE_unknown }; +/// Number of allowed allocate-modifiers. +static constexpr unsigned NumberOfOMPAllocateClauseModifiers = + OMPC_ALLOCATE_unknown; + /// Contains 'interop' data for 'append_args' and 'init' clauses. 
class Expr; struct OMPInteropInfo final { diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 1c6bdb8cad2d1..47f1754aeb629 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1988,7 +1988,7 @@ def SVSM4E : SInst<"svsm4e[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm def SVSM4EKEY : SInst<"svsm4ekey[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm4ekey", [IsOverloadNone]>; } -let SVETargetGuard = "sve2-bitperm", SMETargetGuard = InvalidMode in { +let SVETargetGuard = "sve2,sve-bitperm", SMETargetGuard = InvalidMode in { def SVBDEP : SInst<"svbdep[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x">; def SVBDEP_N : SInst<"svbdep[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x">; def SVBEXT : SInst<"svbext[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bext_x">; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 80360216c9503..bbf5c0e7e7fd1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5756,6 +5756,8 @@ def print_multi_directory : Flag<["-", "--"], "print-multi-directory">; def print_multi_lib : Flag<["-", "--"], "print-multi-lib">; def print_multi_flags : Flag<["-", "--"], "print-multi-flags-experimental">, HelpText<"Print the flags used for selecting multilibs (experimental)">; +def fmultilib_flag : Joined<["-", "--"], "fmultilib-flag=">, + Visibility<[ClangOption]>; def print_multi_os_directory : Flag<["-", "--"], "print-multi-os-directory">, Flags<[Unsupported]>; def print_target_triple : Flag<["-", "--"], "print-target-triple">, diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h index 8ed17179c9824..604e42067a3f1 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -120,7 +120,6 @@ class DependencyFileGenerator : public DependencyCollector { private: void outputDependencyFile(DiagnosticsEngine &Diags); - llvm::IntrusiveRefCntPtr FS; std::string OutputFile; std::vector Targets; bool IncludeSystemHeaders; diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 4fa5fbdb5a7f6..e1faab205f647 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -1099,6 +1099,13 @@ enum AttributeDeclKind { ExpectedFunctionVariableOrClass, ExpectedKernelFunction, ExpectedFunctionWithProtoType, + ExpectedForLoopStatement, + ExpectedVirtualFunction, + ExpectedParameterOrImplicitObjectParameter, + ExpectedNonMemberFunction, + ExpectedFunctionOrClassOrEnum, + ExpectedClass, + ExpectedTypedef, }; inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 3d1cc4fab1c10..a056a96f50233 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -1148,7 +1148,12 @@ class SemaOpenMP : public SemaBase { SourceLocation OmpAllMemoryLoc; SourceLocation StepModifierLoc; /// 'step' modifier location for linear clause - OpenMPAllocateClauseModifier AllocClauseModifier = OMPC_ALLOCATE_unknown; + SmallVector + AllocClauseModifiers; + SmallVector + AllocClauseModifiersLoc; + Expr *AllocateAlignment = nullptr; }; OMPClause *ActOnOpenMPVarListClause(OpenMPClauseKind Kind, @@ -1166,10 +1171,15 @@ class SemaOpenMP : public SemaBase { SourceLocation LParenLoc, SourceLocation EndLoc); /// Called on well-formed 
'allocate' clause. - OMPClause *ActOnOpenMPAllocateClause( - Expr *Allocator, OpenMPAllocateClauseModifier ACModifier, - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation ColonLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + OMPClause * + ActOnOpenMPAllocateClause(Expr *Allocator, Expr *Alignment, + OpenMPAllocateClauseModifier FirstModifier, + SourceLocation FirstModifierLoc, + OpenMPAllocateClauseModifier SecondModifier, + SourceLocation SecondModifierLoc, + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation ColonLoc, SourceLocation LParenLoc, + SourceLocation EndLoc); /// Called on well-formed 'private' clause. OMPClause *ActOnOpenMPPrivateClause(ArrayRef VarList, SourceLocation StartLoc, diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 26d33b0d94795..dec4c7221bc77 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -3165,6 +3165,7 @@ ExpectedDecl ASTNodeImporter::VisitRecordDecl(RecordDecl *D) { if (Error Err = ImportImplicitMethods(DCXX, FoundCXX)) return std::move(Err); } + // FIXME: We can return FoundDef here. } PrevDecl = FoundRecord->getMostRecentDecl(); break; @@ -9064,9 +9065,26 @@ ASTImporter::findDeclsInToCtx(DeclContext *DC, DeclarationName Name) { // We can diagnose this only if we search in the redecl context. DeclContext *ReDC = DC->getRedeclContext(); if (SharedState->getLookupTable()) { - ASTImporterLookupTable::LookupResult LookupResult = - SharedState->getLookupTable()->lookup(ReDC, Name); - return FoundDeclsTy(LookupResult.begin(), LookupResult.end()); + if (ReDC->isNamespace()) { + // Namespaces can be reopened. + // Lookup table does not handle this, we must search here in all linked + // namespaces. + FoundDeclsTy Result; + SmallVector NSChain = + getCanonicalForwardRedeclChain( + dyn_cast(ReDC)); + for (auto *D : NSChain) { + ASTImporterLookupTable::LookupResult LookupResult = + SharedState->getLookupTable()->lookup(dyn_cast(D), + Name); + Result.append(LookupResult.begin(), LookupResult.end()); + } + return Result; + } else { + ASTImporterLookupTable::LookupResult LookupResult = + SharedState->getLookupTable()->lookup(ReDC, Name); + return FoundDeclsTy(LookupResult.begin(), LookupResult.end()); + } } else { DeclContext::lookup_result NoloadLookupResult = ReDC->noload_lookup(Name); FoundDeclsTy Result(NoloadLookupResult.begin(), NoloadLookupResult.end()); diff --git a/clang/lib/AST/ASTImporterLookupTable.cpp b/clang/lib/AST/ASTImporterLookupTable.cpp index 07d39dcee2583..4ed3198d7ea62 100644 --- a/clang/lib/AST/ASTImporterLookupTable.cpp +++ b/clang/lib/AST/ASTImporterLookupTable.cpp @@ -115,8 +115,9 @@ void ASTImporterLookupTable::remove(DeclContext *DC, NamedDecl *ND) { #ifndef NDEBUG if (!EraseResult) { std::string Message = - llvm::formatv("Trying to remove not contained Decl '{0}' of type {1}", - Name.getAsString(), DC->getDeclKindName()) + llvm::formatv( + "Trying to remove not contained Decl '{0}' of type {1} from a {2}", + Name.getAsString(), ND->getDeclKindName(), DC->getDeclKindName()) .str(); llvm_unreachable(Message.c_str()); } @@ -125,18 +126,18 @@ void ASTImporterLookupTable::remove(DeclContext *DC, NamedDecl *ND) { void ASTImporterLookupTable::add(NamedDecl *ND) { assert(ND); - DeclContext *DC = ND->getDeclContext()->getPrimaryContext(); + DeclContext *DC = ND->getDeclContext(); add(DC, ND); - DeclContext *ReDC = DC->getRedeclContext()->getPrimaryContext(); + DeclContext *ReDC = DC->getRedeclContext(); if (DC != ReDC) add(ReDC, ND); } void 
ASTImporterLookupTable::remove(NamedDecl *ND) { assert(ND); - DeclContext *DC = ND->getDeclContext()->getPrimaryContext(); + DeclContext *DC = ND->getDeclContext(); remove(DC, ND); - DeclContext *ReDC = DC->getRedeclContext()->getPrimaryContext(); + DeclContext *ReDC = DC->getRedeclContext(); if (DC != ReDC) remove(ReDC, ND); } @@ -161,7 +162,7 @@ void ASTImporterLookupTable::updateForced(NamedDecl *ND, DeclContext *OldDC) { ASTImporterLookupTable::LookupResult ASTImporterLookupTable::lookup(DeclContext *DC, DeclarationName Name) const { - auto DCI = LookupTable.find(DC->getPrimaryContext()); + auto DCI = LookupTable.find(DC); if (DCI == LookupTable.end()) return {}; @@ -178,7 +179,7 @@ bool ASTImporterLookupTable::contains(DeclContext *DC, NamedDecl *ND) const { } void ASTImporterLookupTable::dump(DeclContext *DC) const { - auto DCI = LookupTable.find(DC->getPrimaryContext()); + auto DCI = LookupTable.find(DC); if (DCI == LookupTable.end()) llvm::errs() << "empty\n"; const auto &FoundNameMap = DCI->second; @@ -196,8 +197,7 @@ void ASTImporterLookupTable::dump(DeclContext *DC) const { void ASTImporterLookupTable::dump() const { for (const auto &Entry : LookupTable) { DeclContext *DC = Entry.first; - StringRef Primary = DC->getPrimaryContext() ? " primary" : ""; - llvm::errs() << "== DC:" << cast(DC) << Primary << "\n"; + llvm::errs() << "== DC:" << cast(DC) << "\n"; dump(DC); } } diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 4246ba95d827f..532933d6183ce 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1019,19 +1019,18 @@ OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) { return new (C) OMPPartialClause(); } -OMPAllocateClause * -OMPAllocateClause::Create(const ASTContext &C, SourceLocation StartLoc, - SourceLocation LParenLoc, Expr *Allocator, - SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, - SourceLocation EndLoc, ArrayRef VL) { +OMPAllocateClause *OMPAllocateClause::Create( + const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, SourceLocation Modifier2Loc, + SourceLocation EndLoc, ArrayRef VL) { // Allocate space for private variables and initializer expressions. 
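// The variable list lives in the clause's trailing storage: the buffer is
// sized for the OMPAllocateClause object itself plus VL.size() trailing
// Expr* slots.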
void *Mem = C.Allocate(totalSizeToAlloc(VL.size())); auto *Clause = new (Mem) OMPAllocateClause( - StartLoc, LParenLoc, Allocator, ColonLoc, AllocatorModifier, - AllocatorModifierLoc, EndLoc, VL.size()); + StartLoc, LParenLoc, Allocator, Alignment, ColonLoc, Modifier1, + Modifier1Loc, Modifier2, Modifier2Loc, EndLoc, VL.size()); Clause->setVarRefs(VL); return Clause; @@ -2245,21 +2244,48 @@ void OMPClausePrinter::VisitOMPClauseList(T *Node, char StartSym) { void OMPClausePrinter::VisitOMPAllocateClause(OMPAllocateClause *Node) { if (Node->varlist_empty()) return; + + Expr *FirstModifier = nullptr; + Expr *SecondModifier = nullptr; + auto FirstAllocMod = Node->getFirstAllocateModifier(); + auto SecondAllocMod = Node->getSecondAllocateModifier(); + bool FirstUnknown = FirstAllocMod == OMPC_ALLOCATE_unknown; + bool SecondUnknown = SecondAllocMod == OMPC_ALLOCATE_unknown; + if (FirstAllocMod == OMPC_ALLOCATE_allocator || + (FirstAllocMod == OMPC_ALLOCATE_unknown && Node->getAllocator())) { + FirstModifier = Node->getAllocator(); + SecondModifier = Node->getAlignment(); + } else { + FirstModifier = Node->getAlignment(); + SecondModifier = Node->getAllocator(); + } + OS << "allocate"; - OpenMPAllocateClauseModifier Modifier = Node->getAllocatorModifier(); - if (Expr *Allocator = Node->getAllocator()) { + // If we have any explicit modifiers. + if (FirstModifier) { OS << "("; - if (Modifier == OMPC_ALLOCATE_allocator) { - OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), Modifier); + if (!FirstUnknown) { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), FirstAllocMod); OS << "("; - Allocator->printPretty(OS, nullptr, Policy, 0); + } + FirstModifier->printPretty(OS, nullptr, Policy, 0); + if (!FirstUnknown) OS << ")"; - } else { - Allocator->printPretty(OS, nullptr, Policy, 0); + if (SecondModifier) { + OS << ", "; + if (!SecondUnknown) { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), + SecondAllocMod); + OS << "("; + } + SecondModifier->printPretty(OS, nullptr, Policy, 0); + if (!SecondUnknown) + OS << ")"; } OS << ":"; VisitOMPClauseList(Node, ' '); } else { + // No modifiers. Just print the variable list. 
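+  // For instance, this path prints "allocate(x,y)", while the modifier path
+  // above prints something like
+  // "allocate(allocator(omp_default_mem_alloc), align(8): x,y)".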
VisitOMPClauseList(Node, '('); } OS << ")"; diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp index 050daee1168d4..13cd7e26dc16f 100644 --- a/clang/lib/Analysis/ThreadSafetyCommon.cpp +++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp @@ -336,7 +336,7 @@ til::SExpr *SExprBuilder::translateDeclRefExpr(const DeclRefExpr *DRE, : (cast(D)->getCanonicalDecl() == Canonical)) { // Substitute call arguments for references to function parameters if (const Expr *const *FunArgs = - Ctx->FunArgs.dyn_cast()) { + dyn_cast(Ctx->FunArgs)) { assert(I < Ctx->NumArgs); return translate(FunArgs[I], Ctx->Prev); } diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 1bf58661d0efc..4e211deb9faba 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -485,7 +485,7 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2 && HasSVEAES) Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1"); - if (HasSVE2 && HasSVE2BitPerm) + if (HasSVE2 && HasSVEBitPerm) Builder.defineMacro("__ARM_FEATURE_SVE2_BITPERM", "1"); if (HasSVE2 && HasSVE2SHA3) @@ -769,7 +769,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("f64mm", FPU & SveMode && HasMatmulFP64) .Case("sve2", FPU & SveMode && HasSVE2) .Case("sve-aes", HasSVEAES) - .Case("sve2-bitperm", FPU & SveMode && HasSVE2BitPerm) + .Case("sve-bitperm", FPU & HasSVEBitPerm) .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sve2p1", FPU & SveMode && HasSVE2p1) @@ -881,12 +881,10 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, } if (Feature == "+sve-b16b16") HasSVEB16B16 = true; - if (Feature == "+sve2-bitperm") { + if (Feature == "+sve-bitperm") { FPU |= NeonMode; - FPU |= SveMode; HasFullFP16 = true; - HasSVE2 = true; - HasSVE2BitPerm = true; + HasSVEBitPerm = true; } if (Feature == "+f32mm") { FPU |= NeonMode; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index cedf3286806ac..ecf80b23a508c 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -82,7 +82,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; bool HasSVEB16B16 = false; - bool HasSVE2BitPerm = false; + bool HasSVEBitPerm = false; bool HasMatmulFP64 = false; bool HasMatmulFP32 = false; bool HasLSE = false; diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index bcf6db1467ffc..79e6bf3d24dff 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -85,6 +85,7 @@ #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include #include #include using namespace clang; @@ -119,6 +120,9 @@ static cl::opt ClPGOColdFuncAttr( extern cl::opt ProfileCorrelate; } // namespace llvm +namespace clang { +extern llvm::cl::opt ClSanitizeGuardChecks; +} namespace { @@ -1023,6 +1027,14 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PB.registerScalarOptimizerLateEPCallback([this](FunctionPassManager &FPM, OptimizationLevel Level) { BoundsCheckingPass::Options Options; + if (CodeGenOpts.SanitizeSkipHotCutoffs[SanitizerKind::SO_LocalBounds] || + ClSanitizeGuardChecks) { + static_assert(SanitizerKind::SO_LocalBounds <= + std::numeric_limits< + decltype(Options.GuardKind)::value_type>::max(), + "Update type 
of llvm.allow.ubsan.check."); + Options.GuardKind = SanitizerKind::SO_LocalBounds; + } Options.Merge = CodeGenOpts.SanitizeMergeHandlers.has(SanitizerKind::LocalBounds); if (!CodeGenOpts.SanitizeTrap.has(SanitizerKind::LocalBounds)) { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 0fde4d8ee296b..e0cf6ca69f0df 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4507,7 +4507,7 @@ void CodeGenFunction::EmitCallArgs( // First, if a prototype was provided, use those argument types. bool IsVariadic = false; if (Prototype.P) { - const auto *MD = Prototype.P.dyn_cast(); + const auto *MD = dyn_cast(Prototype.P); if (MD) { IsVariadic = MD->isVariadic(); ExplicitCC = getCallingConventionForDecl( diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 060d02b7f1487..6e5a21c8f01e7 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -52,11 +52,13 @@ using namespace clang; using namespace CodeGen; +namespace clang { // TODO: Introduce frontend options to enabled per sanitizers, similar to // `fsanitize-trap`. -static llvm::cl::opt ClSanitizeGuardChecks( +llvm::cl::opt ClSanitizeGuardChecks( "ubsan-guard-checks", llvm::cl::Optional, llvm::cl::desc("Guard UBSAN checks with `llvm.allow.ubsan.check()`.")); +} // namespace clang //===--------------------------------------------------------------------===// // Defines for metadata diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index ee10e586d9250..4ba8ee1ca17d4 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -332,7 +332,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { EmitOMPMasterTaskLoopDirective(cast(*S)); break; case Stmt::OMPMaskedTaskLoopDirectiveClass: - llvm_unreachable("masked taskloop directive not supported yet."); + EmitOMPMaskedTaskLoopDirective(cast(*S)); break; case Stmt::OMPMasterTaskLoopSimdDirectiveClass: EmitOMPMasterTaskLoopSimdDirective( @@ -760,6 +760,8 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { bool noinline = false; bool alwaysinline = false; bool noconvergent = false; + HLSLControlFlowHintAttr::Spelling flattenOrBranch = + HLSLControlFlowHintAttr::SpellingNotCalculated; const CallExpr *musttail = nullptr; for (const auto *A : S.getAttrs()) { @@ -791,6 +793,9 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { Builder.CreateAssumption(AssumptionVal); } } break; + case attr::HLSLControlFlowHint: { + flattenOrBranch = cast(A)->getSemanticSpelling(); + } break; } } SaveAndRestore save_nomerge(InNoMergeAttributedStmt, nomerge); @@ -798,6 +803,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline); SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent); SaveAndRestore save_musttail(MustTailCall, musttail); + SaveAndRestore save_flattenOrBranch(HLSLControlFlowAttr, flattenOrBranch); EmitStmt(S.getSubStmt(), S.getAttrs()); } diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 94daf059edba0..2b4ca65e169a6 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -7982,6 +7982,18 @@ void CodeGenFunction::EmitOMPMasterTaskLoopDirective( CGM.getOpenMPRuntime().emitMasterRegion(*this, CodeGen, S.getBeginLoc()); } +void CodeGenFunction::EmitOMPMaskedTaskLoopDirective( + const OMPMaskedTaskLoopDirective &S) { + auto &&CodeGen = 
[this, &S](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + EmitOMPTaskLoopBasedDirective(S); + }; + auto LPCRegion = + CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S); + OMPLexicalScope Scope(*this, S, std::nullopt, /*EmitPreInitStmt=*/false); + CGM.getOpenMPRuntime().emitMaskedRegion(*this, CodeGen, S.getBeginLoc()); +} + void CodeGenFunction::EmitOMPMasterTaskLoopSimdDirective( const OMPMasterTaskLoopSimdDirective &S) { auto &&CodeGen = [this, &S](CodeGenFunction &CGF, PrePostActionTy &Action) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index d6f3716afabdf..11fdddba1144b 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/FPEnv.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" @@ -2086,7 +2087,30 @@ void CodeGenFunction::EmitBranchOnBoolExpr( Weights = createProfileWeights(TrueCount, CurrentCount - TrueCount); } - Builder.CreateCondBr(CondV, TrueBlock, FalseBlock, Weights, Unpredictable); + llvm::Instruction *BrInst = Builder.CreateCondBr(CondV, TrueBlock, FalseBlock, + Weights, Unpredictable); + switch (HLSLControlFlowAttr) { + case HLSLControlFlowHintAttr::Microsoft_branch: + case HLSLControlFlowHintAttr::Microsoft_flatten: { + llvm::MDBuilder MDHelper(CGM.getLLVMContext()); + + llvm::ConstantInt *BranchHintConstant = + HLSLControlFlowAttr == + HLSLControlFlowHintAttr::Spelling::Microsoft_branch + ? llvm::ConstantInt::get(CGM.Int32Ty, 1) + : llvm::ConstantInt::get(CGM.Int32Ty, 2); + + SmallVector Vals( + {MDHelper.createString("hlsl.controlflow.hint"), + MDHelper.createConstant(BranchHintConstant)}); + BrInst->setMetadata("hlsl.controlflow.hint", + llvm::MDNode::get(CGM.getLLVMContext(), Vals)); + break; + } + // This is required to avoid warnings during compilation + case HLSLControlFlowHintAttr::SpellingNotCalculated: + break; + } } /// ErrorUnsupported - Print out an error that codegen doesn't support the diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 86328db345508..b115c15bf01a9 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -615,6 +615,10 @@ class CodeGenFunction : public CodeGenTypeCache { /// True if the current statement has noconvergent attribute. bool InNoConvergentAttributedStmt = false; + /// HLSL Branch attribute. + HLSLControlFlowHintAttr::Spelling HLSLControlFlowAttr = + HLSLControlFlowHintAttr::SpellingNotCalculated; + // The CallExpr within the current statement that the musttail attribute // applies to. nullptr if there is no 'musttail' on the current statement. 
const CallExpr *MustTailCall = nullptr; @@ -3870,6 +3874,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S); void EmitOMPTaskLoopSimdDirective(const OMPTaskLoopSimdDirective &S); void EmitOMPMasterTaskLoopDirective(const OMPMasterTaskLoopDirective &S); + void EmitOMPMaskedTaskLoopDirective(const OMPMaskedTaskLoopDirective &S); void EmitOMPMasterTaskLoopSimdDirective(const OMPMasterTaskLoopSimdDirective &S); void diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 10df730744b08..9a947f32283c3 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -6398,6 +6398,11 @@ std::string Driver::GetFilePath(StringRef Name, const ToolChain &TC) const { if (auto P = SearchPaths(TC.getFilePaths())) return *P; + SmallString<128> R2(ResourceDir); + llvm::sys::path::append(R2, "..", "..", Name); + if (llvm::sys::fs::exists(Twine(R2))) + return std::string(R2); + return std::string(Name); } diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 2b4df64f2789d..acf9d264d631b 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -196,6 +196,15 @@ bool ToolChain::defaultToIEEELongDouble() const { return PPC_LINUX_DEFAULT_IEEELONGDOUBLE && getTriple().isOSLinux(); } +static void processMultilibCustomFlags(Multilib::flags_list &List, + const llvm::opt::ArgList &Args) { + for (const Arg *MultilibFlagArg : + Args.filtered(options::OPT_fmultilib_flag)) { + List.push_back(MultilibFlagArg->getAsString(Args)); + MultilibFlagArg->claim(); + } +} + static void getAArch64MultilibFlags(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, @@ -246,6 +255,8 @@ static void getAArch64MultilibFlags(const Driver &D, if (ABIArg) { Result.push_back(ABIArg->getAsString(Args)); } + + processMultilibCustomFlags(Result, Args); } static void getARMMultilibFlags(const Driver &D, @@ -313,6 +324,7 @@ static void getARMMultilibFlags(const Driver &D, if (Endian->getOption().matches(options::OPT_mbig_endian)) Result.push_back(Endian->getAsString(Args)); } + processMultilibCustomFlags(Result, Args); } static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple, diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 8a36d835d82b3..15fa7de35df97 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -23,10 +23,8 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" -#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include -#include using namespace clang; @@ -238,7 +236,6 @@ void DependencyFileGenerator::attachToPreprocessor(Preprocessor &PP) { PP.SetSuppressIncludeNotFoundError(true); DependencyCollector::attachToPreprocessor(PP); - FS = PP.getFileManager().getVirtualFileSystemPtr(); } bool DependencyFileGenerator::sawDependency(StringRef Filename, bool FromModule, @@ -315,22 +312,11 @@ void DependencyFileGenerator::finishedMainFile(DiagnosticsEngine &Diags) { /// https://msdn.microsoft.com/en-us/library/dd9y37ha.aspx for NMake info, /// https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx /// for Windows file-naming info. 
-static void printFilename(raw_ostream &OS, llvm::vfs::FileSystem *FS, - StringRef Filename, +static void PrintFilename(raw_ostream &OS, StringRef Filename, DependencyOutputFormat OutputFormat) { // Convert filename to platform native path llvm::SmallString<256> NativePath; llvm::sys::path::native(Filename.str(), NativePath); - // Resolve absolute path. Make and Ninja canonicalize paths - // without checking for symbolic links in the path, for performance concerns. - // If there is something like `/bin/../lib64` -> `/usr/lib64` - // (where `/bin` links to `/usr/bin`), Make will see them as `/lib64`. - if (FS != nullptr && llvm::sys::path::is_absolute(NativePath)) { - llvm::SmallString<256> NativePathTmp = NativePath; - std::error_code EC = FS->getRealPath(NativePathTmp, NativePath); - if (EC) - NativePath = NativePathTmp; - } if (OutputFormat == DependencyOutputFormat::NMake) { // Add quotes if needed. These are the characters listed as "special" to @@ -414,7 +400,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) { Columns = 2; } OS << ' '; - printFilename(OS, FS.get(), File, OutputFormat); + PrintFilename(OS, File, OutputFormat); Columns += N + 1; } OS << '\n'; @@ -425,7 +411,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) { for (auto I = Files.begin(), E = Files.end(); I != E; ++I) { if (Index++ == InputFileIndex) continue; - printFilename(OS, FS.get(), *I, OutputFormat); + PrintFilename(OS, *I, OutputFormat); OS << ":\n"; } } diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 7f3f6d568e28c..f136d5007e8a5 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -24,6 +24,7 @@ #include "clang/Parse/RAIIObjectsForParser.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaCUDA.h" @@ -3708,9 +3709,9 @@ void Parser::ParseDeclarationSpecifiers( continue; if (PA.getKind() == ParsedAttr::AT_LifetimeBound) - Diag(PA.getLoc(), diag::err_attribute_wrong_decl_type_str) + Diag(PA.getLoc(), diag::err_attribute_wrong_decl_type) << PA << PA.isRegularKeywordAttribute() - << "parameters and implicit object parameters"; + << ExpectedParameterOrImplicitObjectParameter; else Diag(PA.getLoc(), diag::err_attribute_not_type_attr) << PA << PA.isRegularKeywordAttribute(); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index b4e973bc84a7b..89b83938f352d 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -4530,32 +4530,88 @@ static bool parseStepSize(Parser &P, SemaOpenMP::OpenMPVarListDataTy &Data, } /// Parse 'allocate' clause modifiers. -/// If allocator-modifier exists, return an expression for it and set -/// Data field noting modifier was specified. -/// +/// If allocator-modifier exists, return an expression for it. For both +/// allocator and align modifiers, set Data fields as appropriate. 
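+///
+/// With OpenMP >= 5.1 the accepted forms include, for example:
+///   allocate(allocator(expr) : list)
+///   allocate(align(8) : list)
+///   allocate(allocator(expr), align(8) : list)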
static ExprResult parseOpenMPAllocateClauseModifiers(Parser &P,
                                                     OpenMPClauseKind Kind,
                                                     SemaOpenMP::OpenMPVarListDataTy &Data) {
  const Token &Tok = P.getCurToken();
  Preprocessor &PP = P.getPreprocessor();
  ExprResult Tail;
-  auto Modifier = static_cast<OpenMPAllocateClauseModifier>(
+  ExprResult Val;
+  SourceLocation RLoc;
+  bool AllocatorSeen = false;
+  bool AlignSeen = false;
+  SourceLocation CurrentModifierLoc = Tok.getLocation();
+  auto CurrentModifier = static_cast<OpenMPAllocateClauseModifier>(
      getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok), P.getLangOpts()));
-  if (Modifier == OMPC_ALLOCATE_allocator) {
-    Data.AllocClauseModifier = Modifier;
+
+  // Modifiers did not exist before OpenMP 5.1.
+  if (P.getLangOpts().OpenMP < 51)
+    return P.ParseAssignmentExpression();
+
+  // An allocator-simple-modifier is exclusive and must appear alone. See
+  // OpenMP 6.0 spec, pg. 313, L1 on Modifiers, as well as Table 5.1, pg. 50,
+  // description of the "exclusive" property. If we don't recognize an
+  // explicit simple-/complex- modifier, assume we're looking at an
+  // expression representing the allocator and consider ourselves done.
+  if (CurrentModifier == OMPC_ALLOCATE_unknown)
+    return P.ParseAssignmentExpression();
+
+  do {
    P.ConsumeToken();
-    BalancedDelimiterTracker AllocateT(P, tok::l_paren,
-                                       tok::annot_pragma_openmp_end);
    if (Tok.is(tok::l_paren)) {
-      AllocateT.consumeOpen();
-      Tail = P.ParseAssignmentExpression();
-      AllocateT.consumeClose();
+      switch (CurrentModifier) {
+      case OMPC_ALLOCATE_allocator: {
+        if (AllocatorSeen) {
+          P.Diag(Tok, diag::err_omp_duplicate_modifier)
+              << getOpenMPSimpleClauseTypeName(OMPC_allocate, CurrentModifier)
+              << getOpenMPClauseName(Kind);
+        } else {
+          Data.AllocClauseModifiers.push_back(CurrentModifier);
+          Data.AllocClauseModifiersLoc.push_back(CurrentModifierLoc);
+        }
+        BalancedDelimiterTracker AllocateT(P, tok::l_paren,
+                                           tok::annot_pragma_openmp_end);
+        AllocateT.consumeOpen();
+        Tail = P.ParseAssignmentExpression();
+        AllocateT.consumeClose();
+        AllocatorSeen = true;
+        break;
+      }
+      case OMPC_ALLOCATE_align: {
+        if (AlignSeen) {
+          P.Diag(Tok, diag::err_omp_duplicate_modifier)
+              << getOpenMPSimpleClauseTypeName(OMPC_allocate, CurrentModifier)
+              << getOpenMPClauseName(Kind);
+        } else {
+          Data.AllocClauseModifiers.push_back(CurrentModifier);
+          Data.AllocClauseModifiersLoc.push_back(CurrentModifierLoc);
+        }
+        Val = P.ParseOpenMPParensExpr(getOpenMPClauseName(Kind), RLoc);
+        if (Val.isUsable())
+          Data.AllocateAlignment = Val.get();
+        AlignSeen = true;
+        break;
+      }
+      default:
+        llvm_unreachable("Unexpected allocate modifier");
+      }
    } else {
      P.Diag(Tok, diag::err_expected) << tok::l_paren;
    }
-  } else {
-    Tail = P.ParseAssignmentExpression();
-  }
+    if (Tok.isNot(tok::comma))
+      break;
+    P.ConsumeToken();
+    CurrentModifierLoc = Tok.getLocation();
+    CurrentModifier = static_cast<OpenMPAllocateClauseModifier>(
+        getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok), P.getLangOpts()));
+    // A modifier followed by a comma implies another modifier.
+    if (CurrentModifier == OMPC_ALLOCATE_unknown) {
+      P.Diag(Tok, diag::err_omp_expected_modifier) << getOpenMPClauseName(Kind);
+      break;
+    }
+  } while (!AllocatorSeen || !AlignSeen);
  return Tail;
}

@@ -4832,7 +4888,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
    } else if (Kind == OMPC_allocate ||
               (Kind == OMPC_affinity && Tok.is(tok::identifier) &&
                PP.getSpelling(Tok) == "iterator")) {
-      // Handle optional allocator expression followed by colon delimiter.
+      // Handle optional allocator and align modifiers followed by colon
+      // delimiter.
ColonProtectionRAIIObject ColonRAII(*this); TentativeParsingAction TPA(*this); // OpenMP 5.0, 2.10.1, task Construct. @@ -4849,19 +4906,18 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, Tail = Actions.CorrectDelayedTyposInExpr(Tail); Tail = Actions.ActOnFinishFullExpr(Tail.get(), T.getOpenLocation(), /*DiscardedValue=*/false); - if (Tail.isUsable()) { + if (Tail.isUsable() || Data.AllocateAlignment) { if (Tok.is(tok::colon)) { - Data.DepModOrTailExpr = Tail.get(); + Data.DepModOrTailExpr = Tail.isUsable() ? Tail.get() : nullptr; Data.ColonLoc = ConsumeToken(); TPA.Commit(); } else { // Colon not found, parse only list of variables. TPA.Revert(); - if (Kind == OMPC_allocate && - Data.AllocClauseModifier == OMPC_ALLOCATE_allocator) { + if (Kind == OMPC_allocate && Data.AllocClauseModifiers.size()) { SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); - Diag(Tok, diag::err_modifier_expected_colon) << "allocator"; + Diag(Tok, diag::err_modifier_expected_colon) << "allocate clause"; } } } else { diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index bb4d33560b93b..c1663f2d15c88 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -1868,8 +1868,8 @@ static void handleNakedAttr(Sema &S, Decl *D, const ParsedAttr &AL) { // This form is not allowed to be written on a member function (static or // nonstatic) when in Microsoft compatibility mode. if (S.getLangOpts().MSVCCompat && isa(D)) { - S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "non-member functions"; + S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedNonMemberFunction; return; } } @@ -2761,9 +2761,9 @@ static void handleWarnUnusedResult(Sema &S, Decl *D, const ParsedAttr &AL) { // The standard attribute cannot be applied to variable declarations such // as a function pointer. if (isa(D)) - S.Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type_str) + S.Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type) << AL << AL.isRegularKeywordAttribute() - << "functions, classes, or enumerations"; + << ExpectedFunctionOrClassOrEnum; // If this is spelled as the standard C++17 attribute, but not in C++17, // warn about using it as an extension. 
If there are attribute arguments, @@ -5555,8 +5555,8 @@ static void handleNullableTypeAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (auto *CRD = dyn_cast(D); !CRD || !(CRD->isClass() || CRD->isStruct())) { - S.Diag(AL.getRange().getBegin(), diag::err_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "classes"; + S.Diag(AL.getRange().getBegin(), diag::err_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedClass; return; } diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 66ff92f554fc4..b83b2b12f4a23 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5320,6 +5320,8 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack, Expr *SimpleRefExpr = E; auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange); ValueDecl *VD = Res.first; + if (!VD) + continue; DSAStackTy::DSAVarData Data = Stack->getTopDSA(VD, /*FromParent=*/false); if (!isOpenMPPrivate(Data.CKind)) { S.Diag(E->getExprLoc(), @@ -5330,10 +5332,8 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack, if (checkPreviousOMPAllocateAttribute(S, Stack, E, PrivateVD, AllocatorKind, AC->getAllocator())) continue; - // Placeholder until allocate clause supports align modifier. - Expr *Alignment = nullptr; applyOMPAllocateAttribute(S, PrivateVD, AllocatorKind, AC->getAllocator(), - Alignment, E->getSourceRange()); + AC->getAlignment(), E->getSourceRange()); } } } @@ -15617,7 +15617,9 @@ ExprResult SemaOpenMP::VerifyPositiveIntegerConstantInClause( << E->getSourceRange(); return ExprError(); } - if ((CKind == OMPC_aligned || CKind == OMPC_align) && !Result.isPowerOf2()) { + if ((CKind == OMPC_aligned || CKind == OMPC_align || + CKind == OMPC_allocate) && + !Result.isPowerOf2()) { Diag(E->getExprLoc(), diag::warn_omp_alignment_not_power_of_two) << E->getSourceRange(); return ExprError(); @@ -17153,11 +17155,26 @@ OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, case OMPC_has_device_addr: Res = ActOnOpenMPHasDeviceAddrClause(VarList, Locs); break; - case OMPC_allocate: - Res = ActOnOpenMPAllocateClause(Data.DepModOrTailExpr, - Data.AllocClauseModifier, VarList, StartLoc, - LParenLoc, ColonLoc, EndLoc); + case OMPC_allocate: { + OpenMPAllocateClauseModifier Modifier1 = OMPC_ALLOCATE_unknown; + OpenMPAllocateClauseModifier Modifier2 = OMPC_ALLOCATE_unknown; + SourceLocation Modifier1Loc, Modifier2Loc; + if (!Data.AllocClauseModifiers.empty()) { + assert(Data.AllocClauseModifiers.size() <= 2 && + "More allocate modifiers than expected"); + Modifier1 = Data.AllocClauseModifiers[0]; + Modifier1Loc = Data.AllocClauseModifiersLoc[0]; + if (Data.AllocClauseModifiers.size() == 2) { + Modifier2 = Data.AllocClauseModifiers[1]; + Modifier2Loc = Data.AllocClauseModifiersLoc[1]; + } + } + Res = ActOnOpenMPAllocateClause( + Data.DepModOrTailExpr, Data.AllocateAlignment, Modifier1, Modifier1Loc, + Modifier2, Modifier2Loc, VarList, StartLoc, LParenLoc, ColonLoc, + EndLoc); break; + } case OMPC_nontemporal: Res = ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, EndLoc); break; @@ -23163,32 +23180,37 @@ SemaOpenMP::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, } OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( - Expr *Allocator, OpenMPAllocateClauseModifier AllocClauseModifier, - ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation ColonLoc, SourceLocation EndLoc) { - + Expr *Allocator, Expr *Alignment, + OpenMPAllocateClauseModifier FirstAllocateModifier, + 
SourceLocation FirstAllocateModifierLoc, + OpenMPAllocateClauseModifier SecondAllocateModifier, + SourceLocation SecondAllocateModifierLoc, ArrayRef VarList, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, + SourceLocation EndLoc) { if (Allocator) { // Allocator expression is dependent - skip it for now and build the // allocator when instantiated. - if (Allocator->isTypeDependent() || Allocator->isValueDependent() || - Allocator->isInstantiationDependent() || - Allocator->containsUnexpandedParameterPack()) - return nullptr; - // OpenMP [2.11.4 allocate Clause, Description] - // allocator is an expression of omp_allocator_handle_t type. - if (!findOMPAllocatorHandleT(SemaRef, Allocator->getExprLoc(), DSAStack)) - return nullptr; + bool AllocDependent = + (Allocator->isTypeDependent() || Allocator->isValueDependent() || + Allocator->isInstantiationDependent() || + Allocator->containsUnexpandedParameterPack()); + if (!AllocDependent) { + // OpenMP [2.11.4 allocate Clause, Description] + // allocator is an expression of omp_allocator_handle_t type. + if (!findOMPAllocatorHandleT(SemaRef, Allocator->getExprLoc(), DSAStack)) + return nullptr; - ExprResult AllocatorRes = SemaRef.DefaultLvalueConversion(Allocator); - if (AllocatorRes.isInvalid()) - return nullptr; - AllocatorRes = SemaRef.PerformImplicitConversion( - AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), - AssignmentAction::Initializing, - /*AllowExplicit=*/true); - if (AllocatorRes.isInvalid()) - return nullptr; - Allocator = AllocatorRes.get(); + ExprResult AllocatorRes = SemaRef.DefaultLvalueConversion(Allocator); + if (AllocatorRes.isInvalid()) + return nullptr; + AllocatorRes = SemaRef.PerformImplicitConversion( + AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), + AssignmentAction::Initializing, + /*AllowExplicit=*/true); + if (AllocatorRes.isInvalid()) + return nullptr; + Allocator = AllocatorRes.isUsable() ? AllocatorRes.get() : nullptr; + } } else { // OpenMP 5.0, 2.11.4 allocate Clause, Restrictions. // allocate clauses that appear on a target construct or on constructs in a @@ -23199,6 +23221,17 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( !DSAStack->hasRequiresDeclWithClause()) SemaRef.targetDiag(StartLoc, diag::err_expected_allocator_expression); } + if (Alignment) { + bool AlignmentDependent = Alignment->isTypeDependent() || + Alignment->isValueDependent() || + Alignment->isInstantiationDependent() || + Alignment->containsUnexpandedParameterPack(); + if (!AlignmentDependent) { + ExprResult AlignResult = + VerifyPositiveIntegerConstantInClause(Alignment, OMPC_allocate); + Alignment = AlignResult.isUsable() ? AlignResult.get() : nullptr; + } + } // Analyze and build list of variables. 
SmallVector Vars; for (Expr *RefExpr : VarList) { @@ -23230,11 +23263,10 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( if (Allocator) DSAStack->addInnerAllocatorExpr(Allocator); - OpenMPAllocateClauseModifier AllocatorModifier = AllocClauseModifier; - SourceLocation AllocatorModifierLoc; - return OMPAllocateClause::Create(getASTContext(), StartLoc, LParenLoc, - Allocator, ColonLoc, AllocatorModifier, - AllocatorModifierLoc, EndLoc, Vars); + return OMPAllocateClause::Create( + getASTContext(), StartLoc, LParenLoc, Allocator, Alignment, ColonLoc, + FirstAllocateModifier, FirstAllocateModifierLoc, SecondAllocateModifier, + SecondAllocateModifierLoc, EndLoc, Vars); } OMPClause *SemaOpenMP::ActOnOpenMPNontemporalClause(ArrayRef VarList, diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 106e2430de901..422d8abc1028a 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -619,6 +619,12 @@ static Attr *handleHLSLLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, return ::new (S.Context) HLSLLoopHintAttr(S.Context, A, UnrollFactor); } +static Attr *handleHLSLControlFlowHint(Sema &S, Stmt *St, const ParsedAttr &A, + SourceRange Range) { + + return ::new (S.Context) HLSLControlFlowHintAttr(S.Context, A); +} + static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A, SourceRange Range) { if (A.isInvalid() || A.getKind() == ParsedAttr::IgnoredAttribute) @@ -655,6 +661,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A, return handleLoopHintAttr(S, St, A, Range); case ParsedAttr::AT_HLSLLoopHint: return handleHLSLLoopHintAttr(S, St, A, Range); + case ParsedAttr::AT_HLSLControlFlowHint: + return handleHLSLControlFlowHint(S, St, A, Range); case ParsedAttr::AT_OpenCLUnrollHint: return handleOpenCLUnrollHint(S, St, A, Range); case ParsedAttr::AT_Suppress: diff --git a/clang/lib/Sema/SemaSwift.cpp b/clang/lib/Sema/SemaSwift.cpp index 24fdfb8e57dc3..fe72d6c85c37a 100644 --- a/clang/lib/Sema/SemaSwift.cpp +++ b/clang/lib/Sema/SemaSwift.cpp @@ -650,8 +650,8 @@ void SemaSwift::handleNewType(Decl *D, const ParsedAttr &AL) { } if (!isa(D)) { - Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "typedefs"; + Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedTypedef; return; } diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index e3ec327c1b364..2ccf5a8e1d6f3 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -7983,8 +7983,9 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, if (!FnTy) { // SME ACLE attributes are not supported on K&R-style unprototyped C // functions. 
- S.Diag(attr.getLoc(), diag::warn_attribute_wrong_decl_type) << - attr << attr.isRegularKeywordAttribute() << ExpectedFunctionWithProtoType; + S.Diag(attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << attr << attr.isRegularKeywordAttribute() + << ExpectedFunctionWithProtoType; attr.setInvalid(); return false; } @@ -8676,9 +8677,9 @@ static void HandleLifetimeBoundAttr(TypeProcessingState &State, CurType, CurType); return; } - State.getSema().Diag(Attr.getLoc(), diag::err_attribute_wrong_decl_type_str) + State.getSema().Diag(Attr.getLoc(), diag::err_attribute_wrong_decl_type) << Attr << Attr.isRegularKeywordAttribute() - << "parameters and implicit object parameters"; + << ExpectedParameterOrImplicitObjectParameter; } static void HandleLifetimeCaptureByAttr(TypeProcessingState &State, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 4a3c739ecbeab..4fae2ccb5f6d0 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -2075,15 +2075,18 @@ class TreeTransform { /// /// By default, performs semantic analysis to build the new OpenMP clause. /// Subclasses may override this routine to provide different behavior. - OMPClause *RebuildOMPAllocateClause(Expr *Allocate, - OpenMPAllocateClauseModifier ACModifier, - ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc) { + OMPClause * + RebuildOMPAllocateClause(Expr *Allocate, Expr *Alignment, + OpenMPAllocateClauseModifier FirstModifier, + SourceLocation FirstModifierLoc, + OpenMPAllocateClauseModifier SecondModifier, + SourceLocation SecondModifierLoc, + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ColonLoc, + SourceLocation EndLoc) { return getSema().OpenMP().ActOnOpenMPAllocateClause( - Allocate, ACModifier, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc); + Allocate, Alignment, FirstModifier, FirstModifierLoc, SecondModifier, + SecondModifierLoc, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc); } /// Build a new OpenMP 'num_teams' clause. 
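For reference, a minimal example of the extended 'allocate' clause that the parser, Sema, and TreeTransform changes above accept (illustrative only; assumes -fopenmp -fopenmp-version=51, with omp_default_mem_alloc coming from <omp.h>):

#include <omp.h>

int main() {
  int x = 0;
  // 'allocator' and 'align' may each appear at most once, in either order;
  // the 'align' argument must be a constant power of two.
#pragma omp parallel private(x) \
    allocate(allocator(omp_default_mem_alloc), align(16) : x)
  { x = 1; }
  return 0;
}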
@@ -11224,6 +11227,13 @@ TreeTransform::TransformOMPAllocateClause(OMPAllocateClause *C) { return nullptr; Allocator = AllocatorRes.get(); } + Expr *Alignment = C->getAlignment(); + if (Alignment) { + ExprResult AlignmentRes = getDerived().TransformExpr(Alignment); + if (AlignmentRes.isInvalid()) + return nullptr; + Alignment = AlignmentRes.get(); + } llvm::SmallVector Vars; Vars.reserve(C->varlist_size()); for (auto *VE : C->varlist()) { @@ -11233,7 +11243,9 @@ TreeTransform::TransformOMPAllocateClause(OMPAllocateClause *C) { Vars.push_back(EVar.get()); } return getDerived().RebuildOMPAllocateClause( - Allocator, C->getAllocatorModifier(), Vars, C->getBeginLoc(), + Allocator, Alignment, C->getFirstAllocateModifier(), + C->getFirstAllocateModifierLoc(), C->getSecondAllocateModifier(), + C->getSecondAllocateModifierLoc(), Vars, C->getBeginLoc(), C->getLParenLoc(), C->getColonLoc(), C->getEndLoc()); } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index b53f99732cacc..7361cace49dd7 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11824,10 +11824,12 @@ void OMPClauseReader::VisitOMPMapClause(OMPMapClause *C) { } void OMPClauseReader::VisitOMPAllocateClause(OMPAllocateClause *C) { - C->setAllocatorModifier(Record.readEnum()); + C->setFirstAllocateModifier(Record.readEnum()); + C->setSecondAllocateModifier(Record.readEnum()); C->setLParenLoc(Record.readSourceLocation()); C->setColonLoc(Record.readSourceLocation()); C->setAllocator(Record.readSubExpr()); + C->setAlignment(Record.readSubExpr()); unsigned NumVars = C->varlist_size(); SmallVector Vars; Vars.reserve(NumVars); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 39004fd4d4c37..345d496a93312 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7924,10 +7924,12 @@ void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) { void OMPClauseWriter::VisitOMPAllocateClause(OMPAllocateClause *C) { Record.push_back(C->varlist_size()); - Record.writeEnum(C->getAllocatorModifier()); + Record.writeEnum(C->getFirstAllocateModifier()); + Record.writeEnum(C->getSecondAllocateModifier()); Record.AddSourceLocation(C->getLParenLoc()); Record.AddSourceLocation(C->getColonLoc()); Record.AddStmt(C->getAllocator()); + Record.AddStmt(C->getAlignment()); for (auto *VE : C->varlist()) Record.AddStmt(VE); } diff --git a/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl b/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl new file mode 100644 index 0000000000000..a36779c05fbc9 --- /dev/null +++ b/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-compute -ast-dump %s | FileCheck %s + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used branch 'int (int)' +// CHECK: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NEXT: -HLSLControlFlowHintAttr 0x{{[0-9A-Fa-f]+}} <{{.*}}> branch +export int branch(int X){ + int resp; + [branch] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used flatten 'int (int)' +// CHECK: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NEXT: -HLSLControlFlowHintAttr 0x{{[0-9A-Fa-f]+}} <{{.*}}> flatten +export int flatten(int X){ + int resp; + [flatten] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used no_attr 'int (int)' +// 
CHECK-NOT: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NOT: -HLSLControlFlowHintAttr +export int no_attr(int X){ + int resp; + if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp index 98eb86c929009..15bcc20b7fa2a 100644 --- a/clang/test/CXX/drs/cwg1xx.cpp +++ b/clang/test/CXX/drs/cwg1xx.cpp @@ -1076,6 +1076,26 @@ namespace cwg169 { // cwg169: 3.4 }; } // namespace cwg169 +namespace cwg170 { // cwg170: 3.1 +#if __cplusplus >= 201103L +struct A {}; +struct B : A { int i; }; +struct C : A {}; +struct D : C {}; + +constexpr int f(int A::*) { return 0; } +constexpr int g(int C::*) { return 0; } +constexpr int h(int D::*) { return 0; } + +constexpr auto p = static_cast(&B::i); +constexpr auto q = f(p); +constexpr auto r = g(p); +// since-cxx11-error@-1 {{constexpr variable 'r' must be initialized by a constant expression}} +constexpr auto s = h(p); +// since-cxx11-error@-1 {{constexpr variable 's' must be initialized by a constant expression}} +#endif +} // namespace cwg170 + namespace { // cwg171: 3.4 int cwg171a; } diff --git a/clang/test/CodeGen/AArch64/fmv-dependencies.c b/clang/test/CodeGen/AArch64/fmv-dependencies.c index 097b85e989d86..8dda3b647fcd0 100644 --- a/clang/test/CodeGen/AArch64/fmv-dependencies.c +++ b/clang/test/CodeGen/AArch64/fmv-dependencies.c @@ -192,7 +192,7 @@ int caller() { // CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" // CHECK: attributes #[[sve2]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" // CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+sve2-aes,+v8a" -// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a" +// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-bitperm,+sve2,+sve2-bitperm,+v8a" // CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a" // CHECK: attributes #[[sve2_sm4]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sm4,+sve,+sve2,+sve2-sm4,+v8a" // CHECK: attributes #[[wfxt]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+wfxt" diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c index 950a19115811e..2f3994df03784 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c @@ -1,12 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c new file mode 100644 index 0000000000000..7c70bcf6b4d66 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c @@ -0,0 +1,3182 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix=CHECK-CXX + +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - 
%s | opt -S -p mem2reg | FileCheck %s +// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix=CHECK-CXX + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#ifdef __ARM_FEATURE_SME +#include +#else +#include +#endif + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1, A2) A1##A2 +#endif + +#ifdef __ARM_FEATURE_SME +#define STREAMING __arm_streaming +#else +#define STREAMING +#endif + +// CHECK-LABEL: define dso_local @test_svreinterpret_s8_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[OP]] +// +// CHECK-CXX-LABEL: define dso_local @_Z25test_svreinterpret_s8_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret [[OP]] +// +svint8_t test_svreinterpret_s8_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s8, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_u8_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[OP]] +// +// CHECK-CXX-LABEL: define dso_local @_Z25test_svreinterpret_u8_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret [[OP]] +// +svuint8_t test_svreinterpret_u8_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u8, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_s8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[OP]] +// +// CHECK-CXX-LABEL: define dso_local @_Z25test_svreinterpret_mf8_s8u10__SVInt8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret [[OP]] +// +svmfloat8_t test_svreinterpret_mf8_s8(svint8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_u8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[OP]] +// +// CHECK-CXX-LABEL: define dso_local @_Z25test_svreinterpret_mf8_u8u11__SVUint8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret [[OP]] +// +svmfloat8_t test_svreinterpret_mf8_u8(svuint8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret [[OP]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret [[OP]] +// +svmfloat8_t test_svreinterpret_mf8_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_s16( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// 
+// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_s16u11__SVInt16_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_s16(svint16_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s16)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_u16( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_u16u12__SVUint16_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_u16(svuint16_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u16)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_s32( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_s32u11__SVInt32_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_s32(svint32_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s32)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_u32( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_u32u12__SVUint32_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_u32(svuint32_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u32)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_s64( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_s64u11__SVInt64_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_s64(svint64_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s64)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_u64( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_u64u12__SVUint64_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_u64(svuint64_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u64)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_f16( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: 
ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_f16u13__SVFloat16_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_f16(svfloat16_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f16)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_bf16( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z27test_svreinterpret_mf8_bf16u14__SVBfloat16_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_bf16(svbfloat16_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_f32( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_f32u13__SVFloat32_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_f32(svfloat32_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_mf8_f64( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_mf8_f64u13__SVFloat64_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svmfloat8_t test_svreinterpret_mf8_f64(svfloat64_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_s16_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_s16_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svint16_t test_svreinterpret_s16_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s16, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_u16_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_u16_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svuint16_t test_svreinterpret_u16_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_s32_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] 
= bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_s32_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svint32_t test_svreinterpret_s32_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_u32_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_u32_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svuint32_t test_svreinterpret_u32_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_s64_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_s64_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svint64_t test_svreinterpret_s64_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_u64_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_u64_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svuint64_t test_svreinterpret_u64_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_f16_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_f16_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svfloat16_t test_svreinterpret_f16_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_bf16_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z27test_svreinterpret_bf16_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svreinterpret_bf16_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_f32_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_f32_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svfloat32_t test_svreinterpret_f32_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8)(op); +} + +// CHECK-LABEL: define dso_local @test_svreinterpret_f64_mf8( +// CHECK-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z26test_svreinterpret_f64_mf8u13__SVMfloat8_t( +// CHECK-CXX-SAME: [[OP:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast [[OP]] to +// CHECK-CXX-NEXT: ret [[TMP0]] +// +svfloat64_t test_svreinterpret_f64_mf8(svmfloat8_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f64, _mf8)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_s8_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_s8_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svint8x2_t test_svreinterpret_s8_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_u8_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_u8_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , 
} [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svuint8x2_t test_svreinterpret_u8_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_s8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_mf8_s8_x210svint8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svmfloat8x2_t test_svreinterpret_mf8_s8_x2(svint8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_u8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_mf8_u8_x211svuint8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svmfloat8x2_t test_svreinterpret_mf8_u8_x2(svuint8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] 
= insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svmfloat8x2_t test_svreinterpret_mf8_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x2)(op); +} + +// +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_s16_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_s16_x211svint16x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_s16_x2(svint16x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_u16_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] 
+// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_u16_x212svuint16x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_u16_x2(svuint16x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_s32_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_s32_x211svint32x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_s32_x2(svint32x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s32_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_u32_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_u32_x212svuint32x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = 
insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_u32_x2(svuint32x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_s64_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_s64_x211svint64x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_s64_x2(svint64x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_u64_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_u64_x212svuint64x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] 
to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_u64_x2(svuint64x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_f16_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_f16_x213svfloat16x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_f16_x2(svfloat16x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_bf16_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z30test_svreinterpret_mf8_bf16_x214svbfloat16x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = 
insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_bf16_x2(svbfloat16x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_f32_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_f32_x213svfloat32x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_f32_x2(svfloat32x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_f64_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_f64_x213svfloat64x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_f64_x2(svfloat64x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x2)(op); +} + +// 
CHECK-LABEL: define dso_local { , } @test_svreinterpret_s16_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_s16_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svint16x2_t test_svreinterpret_s16_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_u16_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_u16_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svuint16x2_t test_svreinterpret_u16_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_s32_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 
0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_s32_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svint32x2_t test_svreinterpret_s32_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_u32_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_u32_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svuint32x2_t test_svreinterpret_u32_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_s64_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , 
} poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_s64_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svint64x2_t test_svreinterpret_s64_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_u64_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_u64_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svuint64x2_t test_svreinterpret_u64_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_f16_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } 
[[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_f16_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svfloat16x2_t test_svreinterpret_f16_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_bf16_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z30test_svreinterpret_bf16_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svbfloat16x2_t test_svreinterpret_bf16_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_f32_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_f32_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// 
CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svfloat32x2_t test_svreinterpret_f32_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_f64_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_f64_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svfloat64x2_t test_svreinterpret_f64_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_s8_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z28test_svreinterpret_s8_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, 
[[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svint8x3_t test_svreinterpret_s8_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_u8_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z28test_svreinterpret_u8_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svuint8x3_t test_svreinterpret_u8_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_s8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = 
insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z28test_svreinterpret_mf8_s8_x310svint8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svmfloat8x3_t test_svreinterpret_mf8_s8_x3(svint8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_u8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z28test_svreinterpret_mf8_u8_x311svuint8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svmfloat8x3_t test_svreinterpret_mf8_u8_x3(svuint8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// 
[auto-generated FileCheck bodies elided from here through the end of this section: the scalable-vector types (e.g. <vscale x 16 x i8>) were stripped during text extraction; each elided CHECK/CHECK-CXX body verified that the reinterpret lowers to a pure extractvalue/insertvalue re-packing of the tuple, with one bitcast per element only when the element type changes; the tuple types in the label lines below are restored from the standard SVE ACLE lowering (svmfloat8_t -> <vscale x 16 x i8>, svint16_t -> <vscale x 8 x i16>, and so on)]
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_mf8_x313svmfloat8x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s16_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s16_x311svint16x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_s16_x3(svint16x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u16_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u16_x312svuint16x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_u16_x3(svuint16x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s32_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s32_x311svint32x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_s32_x3(svint32x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s32_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u32_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u32_x312svuint32x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_u32_x3(svuint32x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s64_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s64_x311svint64x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_s64_x3(svint64x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u64_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u64_x312svuint64x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_u64_x3(svuint64x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f16_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_f16_x313svfloat16x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_f16_x3(svfloat16x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_bf16_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z30test_svreinterpret_mf8_bf16_x314svbfloat16x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_bf16_x3(svbfloat16x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f32_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_f32_x313svfloat32x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_f32_x3(svfloat32x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f64_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_f64_x313svfloat64x3_t(
+svmfloat8x3_t test_svreinterpret_mf8_f64_x3(svfloat64x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svreinterpret_s16_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z29test_svreinterpret_s16_mf8_x313svmfloat8x3_t(
+svint16x3_t test_svreinterpret_s16_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svreinterpret_u16_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z29test_svreinterpret_u16_mf8_x313svmfloat8x3_t(
+svuint16x3_t test_svreinterpret_u16_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @test_svreinterpret_s32_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @_Z29test_svreinterpret_s32_mf8_x313svmfloat8x3_t(
+svint32x3_t test_svreinterpret_s32_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @test_svreinterpret_u32_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @_Z29test_svreinterpret_u32_mf8_x313svmfloat8x3_t(
+svuint32x3_t test_svreinterpret_u32_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @test_svreinterpret_s64_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @_Z29test_svreinterpret_s64_mf8_x313svmfloat8x3_t(
+svint64x3_t test_svreinterpret_s64_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @test_svreinterpret_u64_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @_Z29test_svreinterpret_u64_mf8_x313svmfloat8x3_t(
+svuint64x3_t test_svreinterpret_u64_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svreinterpret_f16_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @_Z29test_svreinterpret_f16_mf8_x313svmfloat8x3_t(
+svfloat16x3_t test_svreinterpret_f16_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svreinterpret_bf16_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z30test_svreinterpret_bf16_mf8_x313svmfloat8x3_t(
+svbfloat16x3_t test_svreinterpret_bf16_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @test_svreinterpret_f32_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @_Z29test_svreinterpret_f32_mf8_x313svmfloat8x3_t(
+svfloat32x3_t test_svreinterpret_f32_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @test_svreinterpret_f64_mf8_x3(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @_Z29test_svreinterpret_f64_mf8_x313svmfloat8x3_t(
+svfloat64x3_t test_svreinterpret_f64_mf8_x3(svmfloat8x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x3)(op);
+}
+
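The tests above and below exercise the x3/x4 tuple forms of the ACLE svreinterpret intrinsics. As the generated IR confirms, these reinterprets are bit-level no-ops: each tuple element is re-packed via extractvalue/insertvalue, with a bitcast only where the element type changes. A minimal usage sketch in C, assuming <arm_sve.h> and an FP8-capable toolchain; the wrapper name as_mf8_triple is hypothetical:

  #include <arm_sve.h>

  // Reinterpret three 16-bit integer vectors as three mfloat8 vectors.
  // No data is modified; only the static type of the tuple changes.
  static svmfloat8x3_t as_mf8_triple(svint16x3_t v) {
    return svreinterpret_mf8_s16_x3(v);
  }

The same pattern applies to every pair of element types in this file; only the per-element bitcasts differ.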
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_s8_mf8_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_s8_mf8_x413svmfloat8x4_t(
+svint8x4_t test_svreinterpret_s8_mf8_x4(svmfloat8x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x4)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_u8_mf8_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_u8_mf8_x413svmfloat8x4_t(
+svuint8x4_t test_svreinterpret_u8_mf8_x4(svmfloat8x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x4)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s8_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_mf8_s8_x410svint8x4_t(
+svmfloat8x4_t test_svreinterpret_mf8_s8_x4(svint8x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x4)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u8_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_mf8_u8_x411svuint8x4_t(
+svmfloat8x4_t test_svreinterpret_mf8_u8_x4(svuint8x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x4)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_mf8_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_mf8_x413svmfloat8x4_t(
+svmfloat8x4_t test_svreinterpret_mf8_mf8_x4(svmfloat8x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x4)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s16_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s16_x411svint16x4_t(
+svmfloat8x4_t test_svreinterpret_mf8_s16_x4(svint16x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x4)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u16_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u16_x412svuint16x4_t(
+svmfloat8x4_t test_svreinterpret_mf8_u16_x4(svuint16x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x4)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s32_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s32_x411svint32x4_t(
+svmfloat8x4_t test_svreinterpret_mf8_s32_x4(svint32x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s32_x4)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u32_x4(
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u32_x412svuint32x4_t(
CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_u32_x4(svuint32x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_s64_x411svint64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_s64_x4(svint64x4_t op) 
STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_u64_x412svuint64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_u64_x4(svuint64x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x4)(op); +} + +// +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_f16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], 
[[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f16_x413svfloat16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f16_x4(svfloat16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_bf16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// 
CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z30test_svreinterpret_mf8_bf16_x414svbfloat16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_bf16_x4(svbfloat16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_f32_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f32_x413svfloat32x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: 
[[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f32_x4(svfloat32x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_f64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f64_x413svfloat64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = 
insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f64_x4(svfloat64x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint16x4_t test_svreinterpret_s16_mf8_x4(svmfloat8x4_t op) STREAMING { + return 
SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint16x4_t test_svreinterpret_u16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: 
[[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint32x4_t test_svreinterpret_s32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = 
extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint32x4_t test_svreinterpret_u32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s64_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s64_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } 
[[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint64x4_t test_svreinterpret_s64_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u64_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u64_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint64x4_t test_svreinterpret_u64_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svfloat16x4_t test_svreinterpret_f16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x4)(op); +} + 
+// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_bf16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z30test_svreinterpret_bf16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svbfloat16x4_t test_svreinterpret_bf16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } 
[[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svfloat32x4_t test_svreinterpret_f32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f64_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// 
CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to
+// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3
+// CHECK-NEXT: ret { , , , } [[TMP15]]
+//
+// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f64_mf8_x413svmfloat8x4_t(
+// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1
+// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to
+// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1
+// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2
+// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to
+// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2
+// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3
+// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to
+// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3
+// CHECK-CXX-NEXT: ret { , , , } [[TMP15]]
+//
+svfloat64x4_t test_svreinterpret_f64_mf8_x4(svmfloat8x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x4)(op);
+}
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
index a151d162e0108..2da4ab541869e 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c
index fc95cf541172a..8353b3aebc9fc 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c index d7c070d412a8f..d4681394a0508 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c index 30b798e21f7a1..6d654b9353e7a 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve 
-target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c
index 58445c6b810c7..a98d8e8a2b37c 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
diff --git a/clang/test/CodeGen/AArch64/targetattr.c b/clang/test/CodeGen/AArch64/targetattr.c
index ee7a07244ef9a..f8d5f9912c0d7 100644
--- a/clang/test/CodeGen/AArch64/targetattr.c
+++ b/clang/test/CodeGen/AArch64/targetattr.c
@@ -204,7 +204,7 @@ void applem4() {}
 // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" }
 // CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" }
 // CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" }
-// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" }
+// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve-bitperm,+sve2,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" }
 // CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="cortex-a710" }
 // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+ete,+fp-armv8,+neon,+trbe,+v8a" }
 // CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" }
diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c
index fb264ce32ab99..38b4848c1edc1 100644
--- a/clang/test/CodeGen/allow-ubsan-check.c
+++ b/clang/test/CodeGen/allow-ubsan-check.c
@@ -174,12 +174,14 @@ void use(double*);
 // CHECK-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16
 // CHECK-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR7:[0-9]+]]
 // CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64
-// CHECK-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]]
-// CHECK-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]]
-// CHECK: [[BB1]]:
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]]
+// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]]
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]]
+// CHECK-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]]
+// CHECK: [[BB4]]:
 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]]
-// CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align
8, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: ret double [[TMP2]] +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: ret double [[TMP5]] // CHECK: [[TRAP]]: // CHECK-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] @@ -191,12 +193,14 @@ void use(double*); // TR-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 // TR-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR6:[0-9]+]] // TR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// TR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] -// TR-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] -// TR: [[BB1]]: +// TR-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// TR-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] +// TR-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] +// TR: [[BB4]]: // TR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// TR-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]] -// TR-NEXT: ret double [[TMP2]] +// TR-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]] +// TR-NEXT: ret double [[TMP5]] // TR: [[TRAP]]: // TR-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR5]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] @@ -208,15 +212,17 @@ void use(double*); // REC-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 // REC-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR5:[0-9]+]] // REC-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// REC-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] -// REC-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] -// REC: [[BB1]]: +// REC-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// REC-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] +// REC-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] +// REC: [[BB4]]: // REC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// REC-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] -// REC-NEXT: ret double [[TMP2]] +// REC-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// REC-NEXT: ret double [[TMP5]] // REC: [[TRAP]]: // REC-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]], !nosanitize [[META2]] -// REC-NEXT: br label %[[BB1]], !nosanitize [[META2]] +// REC-NEXT: br label %[[BB4]], !nosanitize [[META2]] // double lbounds(int b, int i) { double a[b]; diff --git a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl new file mode 100644 index 0000000000000..aa13b27581850 --- /dev/null +++ b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl @@ -0,0 +1,48 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s + +// CHECK: define {{.*}} i32 {{.*}}test_branch{{.*}}(i32 {{.*}} [[VALD:%.*]]) +// CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4 +// 
CHECK: [[CMP:%.*]] = icmp sgt i32 [[PARAM]], 0
+// CHECK: br i1 [[CMP]], label %if.then, label %if.else, !hlsl.controlflow.hint [[HINT_BRANCH:![0-9]+]]
+export int test_branch(int X){
+  int resp;
+  [branch] if (X > 0) {
+    resp = -X;
+  } else {
+    resp = X * 2;
+  }
+
+  return resp;
+}
+
+// CHECK: define {{.*}} i32 {{.*}}test_flatten{{.*}}(i32 {{.*}} [[VALD:%.*]])
+// CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4
+// CHECK: [[CMP:%.*]] = icmp sgt i32 [[PARAM]], 0
+// CHECK: br i1 [[CMP]], label %if.then, label %if.else, !hlsl.controlflow.hint [[HINT_FLATTEN:![0-9]+]]
+export int test_flatten(int X){
+  int resp;
+  [flatten] if (X > 0) {
+    resp = -X;
+  } else {
+    resp = X * 2;
+  }
+
+  return resp;
+}
+
+// CHECK: define {{.*}} i32 {{.*}}test_no_attr{{.*}}(i32 {{.*}} [[VALD:%.*]])
+// CHECK-NOT: !hlsl.controlflow.hint
+export int test_no_attr(int X){
+  int resp;
+  if (X > 0) {
+    resp = -X;
+  } else {
+    resp = X * 2;
+  }
+
+  return resp;
+}
+
+//CHECK: [[HINT_BRANCH]] = !{!"hlsl.controlflow.hint", i32 1}
+//CHECK: [[HINT_FLATTEN]] = !{!"hlsl.controlflow.hint", i32 2}
diff --git a/clang/test/Driver/aarch64-implied-sme-features.c b/clang/test/Driver/aarch64-implied-sme-features.c
index 4d507c0e99dd9..23ec27ff1aaff 100644
--- a/clang/test/Driver/aarch64-implied-sme-features.c
+++ b/clang/test/Driver/aarch64-implied-sme-features.c
@@ -51,4 +51,7 @@
 // SME-SUBFEATURE-CONFLICT-REV: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme" "-target-feature" "+sme-i16i64"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+ssve-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE-AES
-// SVE-AES: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-aes" "-target-feature" "+sve-aes"
\ No newline at end of file
+// SVE-AES: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-aes" "-target-feature" "+sve-aes"
+
+// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+ssve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM
+// SVE-BITPERM: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-bitperm" "-target-feature" "+sve-bitperm"
diff --git a/clang/test/Driver/aarch64-implied-sve-features.c b/clang/test/Driver/aarch64-implied-sve-features.c
index e5f1e55345414..ecc1e9500b667 100644
--- a/clang/test/Driver/aarch64-implied-sve-features.c
+++ b/clang/test/Driver/aarch64-implied-sve-features.c
@@ -23,17 +23,24 @@
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve+sve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-SVE2
 // SVE-SVE2: "-target-feature" "+sve" "-target-feature" "+sve2"
 
+// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM
+// SVE-BITPERM: "-target-feature" "+sve-bitperm"
+
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-BITPERM
-// SVE2-BITPERM: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm"
+// SVE2-BITPERM: "-target-feature" "+sve" "-target-feature" "+sve-bitperm" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+nosve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=NOSVE2-BITPERM
+// NOSVE2-BITPERM-NOT: "-target-feature" "+sve-bitperm"
 // NOSVE2-BITPERM-NOT: "-target-feature" "+sve2-bitperm"
 // NOSVE2-BITPERM-NOT: "-target-feature" "+sve2"
 // NOSVE2-BITPERM-NOT: "-target-feature" "+sve"
 // NOSVE2-BITPERM-NOT: sve2-bitperm"
 
+// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve-bitperm+nosve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM-REVERT
+// SVE-BITPERM-REVERT: "-target-feature" "-sve-bitperm"
+
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm+nosve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-BITPERM-REVERT
-// SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-bitperm"
+// SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "-sve-bitperm" "-target-feature" "-sve2" "-target-feature" "-sve2-bitperm"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES-REVERT
 // SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "-sve-aes" "-target-feature" "+sve2" "-target-feature" "-sve2-aes"
@@ -57,7 +64,7 @@
 // SVE2-SM4: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sm4"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SUBFEATURE-MIX
-// SVE2-SUBFEATURE-MIX: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm"
+// SVE2-SUBFEATURE-MIX: "-target-feature" "+sve" "-target-feature" "+sve-bitperm" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm"
 // SVE2-SUBFEATURE-NOT: sve2-aes
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sm4+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SUBFEATURE-CONFLICT
@@ -72,6 +79,7 @@
 // SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" "-target-feature" "+sve2-aes"
 
 // RUN: %clang --target=aarch64-linux-gnu -mcpu=neoverse-n2+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-MCPU-FEATURES
+// SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve-bitperm"
 // SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve2-bitperm"
 // SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve2"
 // SVE-MCPU-FEATURES: "-target-feature" "+sve"
diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
index 3ba2709ad95cc..8d17fe1549e34 100644
--- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp
+++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
@@ -18,6 +18,28 @@
 // RUN:     --target=x86_64-linux-gnu 2>&1 \
 // RUN:   | FileCheck libcxx.cpp
 
+// For macOS there is a different directory structure:
+// the library and the libc++.modules.json file are directly
+// in lib, while the headers are in the clang/<version>
+// directory, which is the resource directory.
+// RUN: mkdir -p %t/Inputs/usr/lib/clang/20
+// RUN: touch %t/Inputs/usr/lib/libc++.so
+// RUN: touch %t/Inputs/usr/lib/libc++.modules.json
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -stdlib=libc++ \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/clang/20 \
+// RUN:     --target=arm64-apple-darwin24.1.0 2>&1 \
+// RUN:   | FileCheck libcxx.cpp.macos
+
+// RUN: rm %t/Inputs/usr/lib/libc++.so
+// RUN: touch %t/Inputs/usr/lib/libc++.a
+// RUN: touch %t/Inputs/usr/lib/libc++.modules.json
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -stdlib=libc++ \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/clang/20 \
+// RUN:     --target=arm64-apple-darwin24.1.0 2>&1 \
+// RUN:   | FileCheck libcxx.cpp.macos
+
 // RUN: rm %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so
 // RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.a
 // RUN: %clang
-print-library-module-manifest-path \ @@ -40,6 +62,10 @@ // CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}libc++.modules.json +//--- libcxx.cpp.macos + +// CHECK: {{.*}}libc++.modules.json + //--- libcxx-no-shared-lib.cpp // Note this might find a different path depending whether search path diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c index 3c74e3620df03..01a97a00de542 100644 --- a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c +++ b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c @@ -28,8 +28,6 @@ // CHECK-NEXT: FEAT_FP16 Enable half-precision floating-point data processing // CHECK-NEXT: FEAT_FP8 Enable FP8 instructions // CHECK-NEXT: FEAT_FP8DOT2 Enable FP8 2-way dot instructions -// CHECK-NEXT: FEAT_FP8DOT4 Enable FP8 4-way dot instructions -// CHECK-NEXT: FEAT_FP8FMA Enable Armv9.5-A FP8 multiply-add instructions // CHECK-NEXT: FEAT_FPAC Enable Armv8.3-A Pointer Authentication Faulting enhancement // CHECK-NEXT: FEAT_FRINTTS Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int // CHECK-NEXT: FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions diff --git a/clang/test/Driver/print-multi-selection-flags.c b/clang/test/Driver/print-multi-selection-flags.c index 5bf6dca5096a7..cf9522aa06852 100644 --- a/clang/test/Driver/print-multi-selection-flags.c +++ b/clang/test/Driver/print-multi-selection-flags.c @@ -90,3 +90,10 @@ // CHECK-RV32E-ORDER: --target=riscv32-unknown-none-elf // CHECK-RV32E-ORDER: -mabi=ilp32e // CHECK-RV32E-ORDER: -march=rv32e{{[0-9]+p[0-9]+}}_c{{[0-9]+p[0-9]+}}_zicsr{{[0-9]+p[0-9]+}} + +// RUN: %clang -print-multi-flags-experimental --target=armv8m.main-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck --check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-ARM-MULTILIB-CUSTOM-FLAG %s +// RUN: %clang -print-multi-flags-experimental --target=aarch64-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck --check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-AARCH64-MULTILIB-CUSTOM-FLAG %s +// CHECK-ARM-MULTILIB-CUSTOM-FLAG: --target=thumbv8m.main-unknown-none-eabi +// CHECK-AARCH64-MULTILIB-CUSTOM-FLAG: --target=aarch64-unknown-none-eabi +// CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=foo +// CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=bar diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c index 09d499548aa56..75aa1a3aeecdd 100644 --- a/clang/test/Driver/print-supported-extensions-aarch64.c +++ b/clang/test/Driver/print-supported-extensions-aarch64.c @@ -78,6 +78,7 @@ // CHECK-NEXT: predres2 FEAT_SPECRES2 Enable Speculation Restriction Instruction // CHECK-NEXT: ssbs FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit // CHECK-NEXT: ssve-aes FEAT_SSVE_AES Enable Armv9.6-A SVE AES support in streaming SVE mode +// CHECK-NEXT: ssve-bitperm FEAT_SSVE_BitPerm Enable Armv9.6-A SVE BitPerm support in streaming SVE mode // CHECK-NEXT: ssve-fp8dot2 FEAT_SSVE_FP8DOT2 Enable SVE2 FP8 2-way dot product instructions // CHECK-NEXT: ssve-fp8dot4 FEAT_SSVE_FP8DOT4 Enable SVE2 FP8 4-way dot product instructions // CHECK-NEXT: ssve-fp8fma FEAT_SSVE_FP8FMA Enable SVE2 FP8 multiply-add instructions @@ -86,10 +87,11 @@ // CHECK-NEXT: sve-aes2 FEAT_SVE_AES2 Enable Armv9.6-A SVE multi-vector AES and multi-vector quadword polynomial multiply 
instructions // CHECK-NEXT: sve-b16b16 FEAT_SVE_B16B16 Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions // CHECK-NEXT: sve-bfscale FEAT_SVE_BFSCALE Enable Armv9.6-A SVE BFloat16 scaling instructions +// CHECK-NEXT: sve-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions // CHECK-NEXT: sve-f16f32mm FEAT_SVE_F16F32MM Enable Armv9.6-A FP16 to FP32 Matrix Multiply // CHECK-NEXT: sve2 FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions // CHECK-NEXT: sve2-aes Shorthand for +sve2+sve-aes -// CHECK-NEXT: sve2-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions +// CHECK-NEXT: sve2-bitperm Shorthand for +sve2+sve-bitperm // CHECK-NEXT: sve2-sha3 FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions // CHECK-NEXT: sve2-sm4 FEAT_SVE_SM4 Enable SM4 SVE2 instructions // CHECK-NEXT: sve2p1 FEAT_SVE2p1 Enable Scalable Vector Extension 2.1 instructions diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index a8d9fcd8569cf..b28e0a07dad24 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -196,6 +196,7 @@ // CHECK-NEXT: xqcicm 0.2 'Xqcicm' (Qualcomm uC Conditional Move Extension) // CHECK-NEXT: xqcics 0.2 'Xqcics' (Qualcomm uC Conditional Select Extension) // CHECK-NEXT: xqcicsr 0.2 'Xqcicsr' (Qualcomm uC CSR Extension) +// CHECK-NEXT: xqciint 0.2 'Xqciint' (Qualcomm uC Interrupts Extension) // CHECK-NEXT: xqcilsm 0.2 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension) // CHECK-NEXT: xqcisls 0.2 'Xqcisls' (Qualcomm uC Scaled Load Store Extension) // CHECK-EMPTY: diff --git a/clang/test/Frontend/dependency-gen-symlink.c b/clang/test/Frontend/dependency-gen-symlink.c index 15664a46b90c8..2fa339ad2abf2 100644 --- a/clang/test/Frontend/dependency-gen-symlink.c +++ b/clang/test/Frontend/dependency-gen-symlink.c @@ -15,7 +15,7 @@ // CHECK: dependency-gen-symlink.c.o // CHECK: dependency-gen-symlink.c // CHECK: a/header.h -// CHECK-NOT: b/header.h +// CHECK: b/header.h // CHECK-NOT: with-header-guard.h #include "a/header.h" #include "b/header.h" diff --git a/clang/test/Frontend/dependency-gen-windows-duplicates.c b/clang/test/Frontend/dependency-gen-windows-duplicates.c index 0ecc23226fb9c..abd351377dc33 100644 --- a/clang/test/Frontend/dependency-gen-windows-duplicates.c +++ b/clang/test/Frontend/dependency-gen-windows-duplicates.c @@ -9,7 +9,7 @@ // RUN: %clang -MD -MF - %t.dir/test.c -fsyntax-only -I %t.dir/subdir | FileCheck %s // CHECK: test.o: // CHECK-NEXT: \test.c -// CHECK-NEXT: \subdir\x.h +// CHECK-NEXT: \SubDir\X.h // File x.h must appear only once (case insensitive check). 
// CHECK-NOT: {{\\|/}}{{x|X}}.{{h|H}} diff --git a/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp b/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp deleted file mode 100644 index 1bf927ebb2eb7..0000000000000 --- a/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp +++ /dev/null @@ -1,255 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix CHECK-TLS %s - -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// expected-no-diagnostics - -#ifndef HEADER -#define HEADER - -enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, - KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__ -}; - -template -struct ST { - static T m; -}; - -template T foo() { - T v; - #pragma omp scope private(v) allocate(allocator(TY):v) - v = ST::m; - return v; -} - -namespace ns { -int a; -} - -int main() { - static int a; - static int temp; - #pragma omp scope private(ns::a) allocate(allocator(omp_pteam_mem_alloc):ns::a) - ns::a++; - - #pragma omp scope private(a) allocate(allocator(omp_thread_mem_alloc):a) - a = 2; - double b = 3; - #pragma omp scope private(temp) allocate(temp) - temp += foo(); - return temp+ns::a; -} - -extern template int ST::m; - -int b; - -void bar(int a, float &z) { - #pragma omp scope private(a,z) allocate(allocator(omp_default_mem_alloc):a,z) - a += b; -} -#endif -// CHECK-LABEL: define dso_local noundef i32 @main( -// 
CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[B:%.*]] = alloca double, align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) -// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) -// CHECK-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 8 to ptr)) -// CHECK-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// CHECK-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) -// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] -// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[TMP4]] -// CHECK-NEXT: ret i32 [[ADD2]] -// -// -// CHECK-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[V1:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// CHECK-NEXT: store i32 [[TMP1]], ptr [[V1]], align 4 -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 -// CHECK-NEXT: ret i32 [[TMP2]] -// -// -// CHECK-LABEL: define dso_local void @_Z3bariRf( -// CHECK-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR3]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, 
ptr @b, align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] -// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: ret void -// -// -// CHECK-TLS-LABEL: define dso_local noundef i32 @main( -// CHECK-TLS-SAME: ) #[[ATTR0:[0-9]+]] { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[B:%.*]] = alloca double, align 8 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) -// CHECK-TLS-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 8 to ptr)) -// CHECK-TLS-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// CHECK-TLS-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) -// CHECK-TLS-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] -// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// CHECK-TLS-NEXT: [[TMP4:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// CHECK-TLS-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[TMP4]] -// CHECK-TLS-NEXT: ret i32 [[ADD2]] -// -// -// CHECK-TLS-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// CHECK-TLS-SAME: ) #[[ATTR3:[0-9]+]] comdat { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[V1:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// CHECK-TLS-NEXT: store i32 [[TMP1]], ptr [[V1]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 -// CHECK-TLS-NEXT: ret i32 [[TMP2]] -// -// -// CHECK-TLS-LABEL: define dso_local void @_Z3bariRf( -// CHECK-TLS-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[Z:%.*]]) #[[ATTR3]] { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-TLS-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-TLS-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK-TLS-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 4 -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] -// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: ret void -// -// -// SIMD-ONLY0-LABEL: define dso_local noundef i32 @main( -// SIMD-ONLY0-SAME: ) #[[ATTR0:[0-9]+]] { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca double, align 8 -// SIMD-ONLY0-NEXT: [[TEMP:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 -// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[A]], align 4 -// SIMD-ONLY0-NEXT: store i32 2, ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[TEMP]], align 4 -// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]] -// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[TEMP]], align 4 -// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// SIMD-ONLY0-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] -// SIMD-ONLY0-NEXT: ret i32 [[ADD2]] -// -// -// SIMD-ONLY0-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// SIMD-ONLY0-SAME: ) #[[ATTR1:[0-9]+]] comdat { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[V1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// SIMD-ONLY0-NEXT: store i32 [[TMP0]], ptr [[V1]], align 4 -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 -// SIMD-ONLY0-NEXT: ret i32 [[TMP1]] -// -// -// SIMD-ONLY0-LABEL: define dso_local void @_Z3bariRf( -// SIMD-ONLY0-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR1]] { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 
-// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[Z2:%.*]] = alloca float, align 4 -// SIMD-ONLY0-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// SIMD-ONLY0-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// SIMD-ONLY0-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// SIMD-ONLY0-NEXT: store ptr [[Z2]], ptr [[TMP]], align 8 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @b, align 4 -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: ret void -// diff --git a/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp b/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp deleted file mode 100644 index 160c4996c1219..0000000000000 --- a/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 %s - -typedef enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, -} omp_allocator_handle_t; - -int myAlloc() { - return 100; -} - -int main() { - int a, b, c; - // expected-error@+4 {{expected '('}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator - // expected-error@+6 {{expected expression}} - // expected-error@+5 {{expected ')'}} - // expected-note@+4 {{to match this '('}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator( - // expected-error@+4 {{expected expression}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator() - // expected-error@+2 {{expected expression}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator()) - // expected-error@+6 {{expected ')'}} - // expected-note@+5 {{to match this '('}} - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc - // expected-error@+6 {{missing ':' after allocator modifier}} - // expected-error@+5 {{expected expression}} - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_large_cap_mem_alloc: - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc) - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_high_bw_mem_alloc)) - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) 
allocate(allocator(omp_low_lat_mem_alloc):) - // expected-error@+6 {{expected ')'}} - // expected-note@+5 {{to match this '('}} - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_cgroup_mem_alloc:) - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_pteam_mem_alloc:)) - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_thread_mem_alloc:c)) - // expected-error@+1 {{expected variable name}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):1) - // expected-error@+1 {{expected variable name}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):-10) - // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} - // expected-error@+3 {{expected ')'}} - // expected-warning@+2 {{extra tokens at the end of '#pragma omp scope' are ignored}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(a,b,c) allocate(allocator(omp_const_mem_alloc):c:b;a) - // expected-error@+1 {{initializing 'const omp_allocator_handle_t' with an expression of incompatible type 'int'}} - #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc);c) - ++a; -} diff --git a/clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp b/clang/test/OpenMP/allocate_modifiers_ast_print.cpp similarity index 51% rename from clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp rename to clang/test/OpenMP/allocate_modifiers_ast_print.cpp index 15f3f1dd9bbb9..436647be75da3 100644 --- a/clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp +++ b/clang/test/OpenMP/allocate_modifiers_ast_print.cpp @@ -41,6 +41,11 @@ int main() { #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) c++; #pragma omp scope private(c,a,b,d) allocate(myAlloc():a,b,c,d) + a++; + #pragma omp scope private(a,b) allocate(align(2), allocator(omp_const_mem_alloc):a,b) + b++; + #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()), align(8) :a,b,c) + c++; // DUMP: FunctionDecl {{.*}} // DUMP: DeclRefExpr {{.*}}'omp_allocator_handle_t' EnumConstant {{.*}}'omp_large_cap_mem_alloc' 'omp_allocator_handle_t' // DUMP: FunctionDecl {{.*}} @@ -76,11 +81,81 @@ int main() { // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'c' 'int' // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'d' 'int' +// DUMP: OMPScopeDirective {{.*}} +// DUMP: OMPPrivateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPAllocateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPScopeDirective {{.*}} +// DUMP: OMPPrivateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var 
{{.*}}'c' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPAllocateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'c' 'int' // PRINT: #pragma omp scope private(a) allocate(omp_const_mem_alloc: a) // PRINT: #pragma omp scope private(a,b) allocate(allocator(omp_const_mem_alloc): a,b) // PRINT: #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()): a,b,c) // PRINT: #pragma omp scope private(c,a,b,d) allocate(myAlloc(): a,b,c,d) - d++; +// PRINT: #pragma omp scope private(a,b) allocate(align(2), allocator(omp_const_mem_alloc): a,b) +// PRINT: #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()), align(8): a,b,c) return a+b+c+d; } + +template +void templated_func(T n) { + int a, b; + T mem = n; + #pragma omp scope private(mem,a,b) allocate(allocator(n),align(al):mem,a,b) + a += b; + #pragma omp scope allocate(allocator(n),align(al):mem,a,b) private(mem,a,b) + a += b; +} + +void template_inst(int n) { + templated_func(omp_const_mem_alloc); + return; +} +// DUMP: FunctionTemplateDecl{{.*}}templated_func +// DUMP: FunctionDecl{{.*}}templated_func 'void (T)' +// DUMP: OMPScopeDirective +// DUMP: OMPPrivateClause +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'T' lvalue Var{{.*}}'mem' 'T' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPScopeDirective +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'T' lvalue Var{{.*}}'mem' 'T' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPPrivateClause + +// DUMP: FunctionDecl{{.*}}used templated_func 'void (omp_allocator_handle_t)' implicit_instantiation +// DUMP: TemplateArgument type 'omp_allocator_handle_t' +// DUMP: EnumType{{.*}}'omp_allocator_handle_t' +// DUMP: Enum{{.*}}'omp_allocator_handle_t' +// DUMP: TemplateArgument integral '4U' + +// DUMP: OMPScopeDirective +// DUMP: OMPPrivateClause +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'omp_allocator_handle_t' lvalue Var{{.*}}'mem' 'omp_allocator_handle_t' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPScopeDirective +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'omp_allocator_handle_t' lvalue Var{{.*}}'mem' 'omp_allocator_handle_t' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPPrivateClause +// PRINT: #pragma omp scope private(mem,a,b) allocate(allocator(n), align(al): mem,a,b) +// PRINT: #pragma omp scope allocate(allocator(n), align(al): mem,a,b) private(mem,a,b) +// PRINT: #pragma omp scope private(mem,a,b) allocate(allocator(n), align(4U): mem,a,b) +// PRINT: #pragma omp scope allocate(allocator(n), align(4U): mem,a,b) private(mem,a,b) + #endif diff --git a/clang/test/OpenMP/allocate_modifiers_codegen.cpp b/clang/test/OpenMP/allocate_modifiers_codegen.cpp new file mode 100644 index 0000000000000..d798e9b3435f0 --- /dev/null +++ b/clang/test/OpenMP/allocate_modifiers_codegen.cpp @@ -0,0 +1,409 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" 
"pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix CHECK-TLS %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__ +}; + +template +struct ST { + static T m; +}; + +template T foo() { + T v; + #pragma omp scope private(v) allocate(allocator(TY):v) + v = ST::m; + #pragma omp scope private(v) allocate(align(al), allocator(TY):v) + ++v; + return v; +} + +namespace ns { +int a; +} + +omp_allocator_handle_t foo(); + +int main() { + static int a; + static int temp; + int v; + #pragma omp scope private(ns::a) allocate(allocator(omp_pteam_mem_alloc):ns::a) + ns::a++; + #pragma omp scope private(a) allocate(align(8),allocator(omp_thread_mem_alloc):a) + a = 2; + #pragma omp scope private(v) allocate(align(1) : v) + ++v; + #pragma omp scope private(v) allocate(allocator(omp_default_mem_alloc) : v) + ++v; + #pragma omp scope private(v) allocate(allocator(omp_large_cap_mem_alloc), align(8) : v) + ++v; + #pragma omp scope private(v) allocate(align(4) : v) + ++v; + #pragma omp scope private(v) allocate(align(2), allocator(omp_default_mem_alloc) : v) + ++v; + #pragma omp scope private(v) allocate(align(8), allocator(foo()) : v) + ++v; + + double b = 3; + #pragma omp scope private(temp) allocate(temp) + temp += foo(); + return temp+ns::a; +} + +extern template int ST::m; + +const int b = 8; + +void bar(int a, float &z) { + #pragma omp scope private(a,z) allocate(align(b), 
allocator(omp_default_mem_alloc) : a,z) + a += b + z; +} +#endif +// CHECK-LABEL: define dso_local noundef i32 @main( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B:%.*]] = alloca double, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 8 to ptr)) +// CHECK-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-NEXT: store i32 [[INC2]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR3:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-NEXT: [[INC4:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-NEXT: store i32 [[INC4]], ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR3]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR5:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 2 to ptr)) +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-NEXT: [[INC6:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC6]], ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR5]], ptr inttoptr (i64 2 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR7:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-NEXT: [[INC8:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-NEXT: store i32 [[INC8]], ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR7]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR9:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-NEXT: [[INC10:%.*]] = add nsw i32 
[[TMP6]], 1 +// CHECK-NEXT: store i32 [[INC10]], ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR9]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[CALL:%.*]] = call noundef i64 @_Z3foov() +// CHECK-NEXT: [[CONV:%.*]] = inttoptr i64 [[CALL]] to ptr +// CHECK-NEXT: [[DOTV__VOID_ADDR11:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr [[CONV]]) +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-NEXT: [[INC12:%.*]] = add nsw i32 [[TMP7]], 1 +// CHECK-NEXT: store i32 [[INC12]], ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-NEXT: [[CALL13:%.*]] = call noundef i64 @_Z3foov() +// CHECK-NEXT: [[CONV14:%.*]] = inttoptr i64 [[CALL13]] to ptr +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR11]], ptr [[CONV14]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// CHECK-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) +// CHECK-NEXT: [[CALL15:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[CALL15]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP9]], [[TMP10]] +// CHECK-NEXT: ret i32 [[ADD16]] +// +// +// CHECK-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// CHECK-SAME: ) #[[ATTR4:[0-9]+]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR1]], ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-LABEL: define dso_local void @_Z3bariRf( +// CHECK-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR4]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, 
align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP2]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to float +// CHECK-NEXT: [[ADD1:%.*]] = fadd float [[CONV]], [[ADD]] +// CHECK-NEXT: [[CONV2:%.*]] = fptosi float [[ADD1]] to i32 +// CHECK-NEXT: store i32 [[CONV2]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-TLS-LABEL: define dso_local noundef i32 @main( +// CHECK-TLS-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[B:%.*]] = alloca double, align 8 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-TLS-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 8 to ptr)) +// CHECK-TLS-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-TLS-NEXT: store i32 [[INC2]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR3:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-TLS-NEXT: [[INC4:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-TLS-NEXT: store i32 [[INC4]], ptr [[DOTV__VOID_ADDR3]], align 4 +// 
CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR3]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR5:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 2 to ptr)) +// CHECK-TLS-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-TLS-NEXT: [[INC6:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-TLS-NEXT: store i32 [[INC6]], ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR5]], ptr inttoptr (i64 2 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR7:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-TLS-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-TLS-NEXT: [[INC8:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-TLS-NEXT: store i32 [[INC8]], ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR7]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR9:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-TLS-NEXT: [[INC10:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK-TLS-NEXT: store i32 [[INC10]], ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR9]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[CALL:%.*]] = call noundef i64 @_Z3foov() +// CHECK-TLS-NEXT: [[CONV:%.*]] = inttoptr i64 [[CALL]] to ptr +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR11:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr [[CONV]]) +// CHECK-TLS-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-TLS-NEXT: [[INC12:%.*]] = add nsw i32 [[TMP7]], 1 +// CHECK-TLS-NEXT: store i32 [[INC12]], ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-TLS-NEXT: [[CALL13:%.*]] = call noundef i64 @_Z3foov() +// CHECK-TLS-NEXT: [[CONV14:%.*]] = inttoptr i64 [[CALL13]] to ptr +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR11]], ptr [[CONV14]]) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// CHECK-TLS-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) +// CHECK-TLS-NEXT: [[CALL15:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// CHECK-TLS-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[CALL15]] +// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// CHECK-TLS-NEXT: [[TMP10:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// CHECK-TLS-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP9]], [[TMP10]] +// CHECK-TLS-NEXT: ret i32 [[ADD16]] +// +// +// CHECK-TLS-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// 
CHECK-TLS-SAME: ) #[[ATTR4:[0-9]+]] comdat { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// CHECK-TLS-NEXT: store i32 [[TMP1]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR1]], ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4 +// CHECK-TLS-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-TLS-LABEL: define dso_local void @_Z3bariRf( +// CHECK-TLS-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR4]] { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TLS-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-TLS-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK-TLS-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK-TLS-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP2]] +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to float +// CHECK-TLS-NEXT: [[ADD1:%.*]] = fadd float [[CONV]], [[ADD]] +// CHECK-TLS-NEXT: [[CONV2:%.*]] = fptosi float [[ADD1]] to i32 +// CHECK-TLS-NEXT: store i32 [[CONV2]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define dso_local noundef i32 @main( +// SIMD-ONLY0-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V2:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V4:%.*]] = 
alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V6:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V8:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V10:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V12:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: [[TEMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[A]], align 4 +// SIMD-ONLY0-NEXT: store i32 2, ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC3]], ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[V4]], align 4 +// SIMD-ONLY0-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC5]], ptr [[V4]], align 4 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, ptr [[V6]], align 4 +// SIMD-ONLY0-NEXT: [[INC7:%.*]] = add nsw i32 [[TMP3]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC7]], ptr [[V6]], align 4 +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load i32, ptr [[V8]], align 4 +// SIMD-ONLY0-NEXT: [[INC9:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC9]], ptr [[V8]], align 4 +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, ptr [[V10]], align 4 +// SIMD-ONLY0-NEXT: [[INC11:%.*]] = add nsw i32 [[TMP5]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC11]], ptr [[V10]], align 4 +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load i32, ptr [[V12]], align 4 +// SIMD-ONLY0-NEXT: [[INC13:%.*]] = add nsw i32 [[TMP6]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC13]], ptr [[V12]], align 4 +// SIMD-ONLY0-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = load i32, ptr [[TEMP]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[CALL]] +// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[TEMP]], align 4 +// SIMD-ONLY0-NEXT: [[TMP8:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// SIMD-ONLY0-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// SIMD-ONLY0-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP8]], [[TMP9]] +// SIMD-ONLY0-NEXT: ret i32 [[ADD14]] +// +// +// SIMD-ONLY0-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// SIMD-ONLY0-SAME: ) #[[ATTR1:[0-9]+]] comdat { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V1:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V2:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// SIMD-ONLY0-NEXT: store i32 [[TMP0]], ptr [[V1]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY0-LABEL: define dso_local void @_Z3bariRf( +// SIMD-ONLY0-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR1]] { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 +// 
SIMD-ONLY0-NEXT: [[Z2:%.*]] = alloca float, align 4 +// SIMD-ONLY0-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// SIMD-ONLY0-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store ptr [[Z2]], ptr [[TMP]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP1]] +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP2]] to float +// SIMD-ONLY0-NEXT: [[ADD3:%.*]] = fadd float [[CONV]], [[ADD]] +// SIMD-ONLY0-NEXT: [[CONV4:%.*]] = fptosi float [[ADD3]] to i32 +// SIMD-ONLY0-NEXT: store i32 [[CONV4]], ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: ret void +// diff --git a/clang/test/OpenMP/allocate_modifiers_messages.cpp b/clang/test/OpenMP/allocate_modifiers_messages.cpp new file mode 100644 index 0000000000000..6867e78a89ee9 --- /dev/null +++ b/clang/test/OpenMP/allocate_modifiers_messages.cpp @@ -0,0 +1,159 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 %s + +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, +} omp_allocator_handle_t; + +int myAlloc() { + return 100; +} + +int main() { + int a, b, c; + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator + // expected-error@+6 {{expected expression}} + // expected-error@+5 {{expected ')'}} + // expected-note@+4 {{to match this '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator( + // expected-error@+4 {{expected expression}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator() + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator()) + // expected-error@+6 {{expected ')'}} + // expected-note@+5 {{to match this '('}} + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc + // expected-error@+6 {{missing ':' after allocate clause modifier}} + // expected-error@+5 {{expected expression}} + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_large_cap_mem_alloc: + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc) + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // 
expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_high_bw_mem_alloc)) + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_low_lat_mem_alloc):) + // expected-error@+6 {{expected ')'}} + // expected-note@+5 {{to match this '('}} + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_cgroup_mem_alloc:) + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_pteam_mem_alloc:)) + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_thread_mem_alloc:c)) + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):1) + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):-10) + // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+3 {{expected ')'}} + // expected-warning@+2 {{extra tokens at the end of '#pragma omp scope' are ignored}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a,b,c) allocate(allocator(omp_const_mem_alloc):c:b;a) + // expected-error@+1 {{initializing 'const omp_allocator_handle_t' with an expression of incompatible type 'int'}} + #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc);c) + // expected-error@+2 {{duplicate modifier 'allocator' in 'allocate' clause}} + // expected-warning@+1 {{aligned clause will be ignored because the requested alignment is not a power of 2}} + #pragma omp scope private(a) allocate(allocator(omp_default_mem_alloc), allocator(omp_default_mem_alloc), align(3) : a) + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(allocator + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(b) allocate(align + // expected-error@+1 {{duplicate modifier 'align' in 'allocate' clause}} + #pragma omp scope private(a) allocate(align(8), align(4) : a) + // expected-error@+5 {{use of undeclared identifier 'align'}} + // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+3 {{expected ')'}} + // expected-note@+2 {{to match this '('}} + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(a) allocate(omp_default_mem_alloc, align(8) : a) + // expected-error@+3 {{expected modifier in 'allocate' clause}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) 
allocate(align(8), omp_default_mem_alloc : a) + // expected-error@+5 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{expected variable name}} + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(a) allocate(omp_default_mem_alloc, omp_default_mem_alloc : a) + // expected-error@+2 {{use of undeclared identifier 'undefinedVar'}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) allocate(undefinedVar : a) + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) : ) + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) ) + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) : + + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected '('}} + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{use of undeclared identifier 'allocator'}} + #pragma omp scope private(a) allocate(align, allocator : ) + // expected-error@+7 {{expected expression}} + // expected-error@+6 {{expected expression}} + // expected-error@+5 {{expected expression}} + // expected-error@+4 {{use of undeclared identifier 'allocator'}} + // expected-error@+3 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(align(), allocator() : ) + ++a; +} diff --git a/clang/test/OpenMP/masked_taskloop_codegen.c b/clang/test/OpenMP/masked_taskloop_codegen.c new file mode 100644 index 0000000000000..26f54c1797bbe --- /dev/null +++ b/clang/test/OpenMP/masked_taskloop_codegen.c @@ -0,0 +1,50 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 5 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=52 -x c -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics +#define N 100 +void masked_taskloop(){ + #pragma omp masked taskloop + for( int i = 0; i < N; i++) + ; + +} + +int main() +{ + masked_taskloop(); +} +// CHECK-LABEL: define dso_local void @masked_taskloop( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_masked(ptr @[[GLOB1]], i32 [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[TMP2]], label %[[OMP_IF_THEN:.*]], label %[[OMP_IF_END:.*]] +// CHECK: [[OMP_IF_THEN]]: +// CHECK-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP3:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], 
i32 1, i64 80, i64 0, ptr @.omp_task_entry.) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 5 +// CHECK-NEXT: store i64 0, ptr [[TMP5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 6 +// CHECK-NEXT: store i64 99, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 7 +// CHECK-NEXT: store i64 1, ptr [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 9 +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 0, i64 8, i1 false) +// CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_taskloop(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP3]], i32 1, ptr [[TMP5]], ptr [[TMP6]], i64 [[TMP9]], i32 1, i32 0, i64 0, ptr null) +// CHECK-NEXT: call void @__kmpc_end_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_end_masked(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: br label %[[OMP_IF_END]] +// CHECK: [[OMP_IF_END]]: +// CHECK-NEXT: ret void +// +// CHECK-LABEL: define dso_local i32 @main( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @masked_taskloop() +// CHECK-NEXT: ret i32 0 + diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 86265f630296c..b10c55447d9af 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -246,7 +246,12 @@ // CHECK-SVE2SHA3: __ARM_FEATURE_SVE2_SHA3 1 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-sm4 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2SM4 %s // CHECK-SVE2SM4: __ARM_FEATURE_SVE2_SM4 1 -// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVEBITPERM %s +// CHECK-SVEBITPERM: __ARM_FEATURE_SVE2_BITPERM 1 + +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+sve-bitperm+sve2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2 1 // CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2_BITPERM 1 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2p1 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p1 %s diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c index bea0b29bcc70a..fd5374d928ea9 100644 --- a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c +++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -fsyntax-only -verify %s // REQUIRES: aarch64-registered-target diff --git 
a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp index 93d4b00701693..985ea15ac2a4e 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp @@ -26,61 +26,61 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64) // expected-error@+2 {{'svaesmc_u8' needs target feature sve,sve2,sve-aes}} // overload-error@+1 {{'svaesmc' needs target feature sve,sve2,sve-aes}} SVE_ACLE_FUNC(svaesmc,_u8,,)(svundef_u8()); - // expected-error@+2 {{'svbdep_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbdep_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbext_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbext_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbgrp_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbgrp_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbdep_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbdep_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} 
SVE_ACLE_FUNC(svbdep,_n_u16,,)(svundef_u16(), u16); - // expected-error@+2 {{'svbext_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbext_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u16,,)(svundef_u16(), u16); - // expected-error@+2 {{'svbgrp_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbgrp_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u16,,)(svundef_u16(), u16); - // expected-error@+2 {{'svbdep_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbdep_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u32,,)(svundef_u32(), u32); - // expected-error@+2 {{'svbext_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbext_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u32,,)(svundef_u32(), u32); - // expected-error@+2 {{'svbgrp_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbgrp_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // 
expected-error@+2 {{'svbgrp_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u32,,)(svundef_u32(), u32); // expected-error@+2 {{'svsm4e_u32' needs target feature sve,sve2-sm4}} // overload-error@+1 {{'svsm4e' needs target feature sve,sve2-sm4}} @@ -89,23 +89,23 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64) // overload-error@+1 {{'svsm4ekey' needs target feature sve,sve2-sm4}} SVE_ACLE_FUNC(svsm4ekey,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbdep_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbdep_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u64,,)(svundef_u64(), u64); - // expected-error@+2 {{'svbext_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbext_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u64,,)(svundef_u64(), u64); - // expected-error@+2 {{'svbgrp_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbgrp_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u64,,)(svundef_u64(), u64); // expected-error@+2 {{'svpmullb_pair_u64' needs target feature sve,sve2,sve-aes}} // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2,sve-aes}} diff --git a/clang/test/VFS/external-names.c b/clang/test/VFS/external-names.c index dd0b5eb501840..5b7c443b36e56 100644 --- a/clang/test/VFS/external-names.c +++ b/clang/test/VFS/external-names.c @@ -47,4 +47,4 @@ // RUN: %clang_cc1 -D REINCLUDE -I %t -ivfsoverlay %t.yaml -Eonly %s -MTfoo -dependency-file %t.dep // RUN: cat %t.dep | FileCheck --check-prefix=CHECK-DEP %s -// CHECK-DEP: Inputs{{..?}}external-names.h +// CHECK-DEP-NOT: Inputs diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c index 
018f992640065..58feddeb6bea0 100644 --- a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c +++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c @@ -18,7 +18,6 @@ // UTC_ARGS: --enable #ifdef __arm__ -/// FIXME: UTC does not find this function, but can find all others. typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t; int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return a + b + c; diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected index 5d514f9d64c02..e17ce61db9c2b 100644 --- a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected +++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected @@ -18,8 +18,22 @@ // UTC_ARGS: --enable #ifdef __arm__ -/// FIXME: UTC does not find this function, but can find all others. typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t; +// THUMB-DARWIN-LABEL: @test_vaba_s8( +// THUMB-DARWIN-NEXT: entry: +// THUMB-DARWIN-NEXT: [[A_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: [[B_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: [[C_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[A:%.*]], ptr [[A_ADDR]], align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[B:%.*]], ptr [[B_ADDR]], align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[C:%.*]], ptr [[C_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[ADD:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]] +// THUMB-DARWIN-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[C_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[ADD1:%.*]] = add <8 x i8> [[ADD]], [[TMP2]] +// THUMB-DARWIN-NEXT: ret <8 x i8> [[ADD1]] +// int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return a + b + c; } diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index f2bfde9bed372..791248e7a394f 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -6052,7 +6052,7 @@ TEST_P(ASTImporterLookupTableTest, EnumConstantDecl) { EXPECT_EQ(*Res.begin(), A); } -TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { +TEST_P(ASTImporterLookupTableTest, LookupSearchesInActualNamespaceOnly) { TranslationUnitDecl *ToTU = getToTuDecl( R"( namespace N { @@ -6062,7 +6062,9 @@ TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { } )", Lang_CXX03); - auto *N1 = + auto *N1 = FirstDeclMatcher().match( + ToTU, namespaceDecl(hasName("N"))); + auto *N2 = LastDeclMatcher().match(ToTU, namespaceDecl(hasName("N"))); auto *A = FirstDeclMatcher().match(ToTU, varDecl(hasName("A"))); DeclarationName Name = A->getDeclName(); @@ -6071,6 +6073,7 @@ TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { auto Res = LT.lookup(N1, Name); ASSERT_EQ(Res.size(), 1u); EXPECT_EQ(*Res.begin(), A); + EXPECT_TRUE(LT.lookup(N2, Name).empty()); } TEST_P(ASTImporterOptionSpecificTestBase, @@ -10170,6 +10173,151 @@ TEST_P(ImportTemplateParmDeclDefaultValue, FromD, FromDInherited); } +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch1) { + const char *ToCode = + R"( + namespace a { + } + namespace a { + struct X { int A; }; + } + )"; + getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + 
struct X { char A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_FALSE(ImportedX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch2) { + const char *ToCode = + R"( + namespace a { + struct X { int A; }; + } + namespace a { + } + )"; + getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { char A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_FALSE(ImportedX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceMatch1) { + const char *ToCode = + R"( + namespace a { + } + namespace a { + struct X { int A; }; + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { int A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ToX = FirstDeclMatcher().match( + ToTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_EQ(ImportedX, ToX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceMatch2) { + const char *ToCode = + R"( + namespace a { + struct X { int A; }; + } + namespace a { + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { int A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ToX = FirstDeclMatcher().match( + ToTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_EQ(ImportedX, ToX); +} + +TEST_P(ASTImporterLookupTableTest, PrimaryDCChangeAtImport) { + const char *ToCode = + R"( + template + struct X; + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + auto *ToX = FirstDeclMatcher().match( + ToTU, classTemplateDecl(hasName("X"))); + NamedDecl *ToParm = ToX->getTemplateParameters()->getParam(0); + DeclContext *OldPrimaryDC = ToX->getTemplatedDecl()->getPrimaryContext(); + ASSERT_EQ(ToParm->getDeclContext(), ToX->getTemplatedDecl()); + ASSERT_EQ(SharedStatePtr->getLookupTable() + ->lookup(ToX->getTemplatedDecl(), ToParm->getDeclName()) + .size(), + 1u); + ASSERT_TRUE(SharedStatePtr->getLookupTable()->contains( + ToX->getTemplatedDecl(), ToParm)); + + const char *Code = + R"( + template + struct X; + template + struct X {}; + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + + auto *ImportedX = Import(FromX, Lang_CXX11); + + EXPECT_TRUE(ImportedX); + EXPECT_EQ(ImportedX->getTemplateParameters()->getParam(0)->getDeclContext(), + ImportedX->getTemplatedDecl()); + + // ToX did not change at the import. + // Verify that primary context has changed after import of class definition. + DeclContext *NewPrimaryDC = ToX->getTemplatedDecl()->getPrimaryContext(); + EXPECT_NE(OldPrimaryDC, NewPrimaryDC); + // The lookup table should not be different than it was before. 
+ EXPECT_EQ(SharedStatePtr->getLookupTable() + ->lookup(ToX->getTemplatedDecl(), ToParm->getDeclName()) + .size(), + 1u); + EXPECT_TRUE(SharedStatePtr->getLookupTable()->contains( + ToX->getTemplatedDecl(), ToParm)); +} + TEST_P(ASTImporterOptionSpecificTestBase, ExistingUndeclaredImportDeclaredFriend) { Decl *ToTU = getToTuDecl( diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 97b768db3a313..35477cfc3cf45 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -295,7 +295,7 @@ class SVEEmitter { const char *Suffix; }; - static const std::array Reinterprets; + static const std::array Reinterprets; const RecordKeeper &Records; StringMap EltTypes; @@ -418,9 +418,10 @@ class SVEEmitter { SmallVectorImpl> &Out); }; -const std::array SVEEmitter::Reinterprets = +const std::array SVEEmitter::Reinterprets = {{{SVEType("c", 'd'), "s8"}, {SVEType("Uc", 'd'), "u8"}, + {SVEType("m", 'd'), "mf8"}, {SVEType("s", 'd'), "s16"}, {SVEType("Us", 'd'), "u16"}, {SVEType("i", 'd'), "s32"}, diff --git a/clang/www/analyzer/annotations.html b/clang/www/analyzer/annotations.html index bf0076e514278..b19d47bce2662 100644 --- a/clang/www/analyzer/annotations.html +++ b/clang/www/analyzer/annotations.html @@ -3,6 +3,8 @@ Source Annotations + + @@ -15,765 +17,11 @@

 Source Annotations

+This page is deprecated and will be removed in release 21.0
+
+Its content was migrated to the regular LLVM documentation.
+

-The Clang frontend supports several source-level annotations in the form of
-GCC-style attributes and pragmas that can help make using the Clang Static
-Analyzer more useful. These annotations can both suppress false positives
-and enhance the analyzer's ability to find bugs.
-
-This page gives a practical overview of such annotations. For more technical
-specifics regarding Clang-specific annotations, please see Clang's list of
-language extensions. Details of "standard" GCC attributes (that Clang also
-supports) can be found in the GCC manual, with the majority of the relevant
-attributes being in the section on function attributes.
-
-Note that attributes that are labeled Clang-specific are not recognized by
-GCC. Their use can be conditioned using preprocessor macros (examples
-included on this page).

-Specific Topics

-Annotations to Enhance Generic Checks
-
-Null Pointer Checking
-
-Attribute 'nonnull'

-The analyzer recognizes the GCC attribute 'nonnull', which indicates that a
-function expects that a given function parameter is not a null pointer.
-Specific details of the syntax of using the 'nonnull' attribute can be found
-in GCC's documentation.
-
-Both the Clang compiler and GCC will flag warnings for simple cases where a
-null pointer is directly being passed to a function with a 'nonnull'
-parameter (e.g., as a constant). The analyzer extends this checking by using
-its deeper symbolic analysis to track what pointer values are potentially
-null and then flag warnings when they are passed in a function call via a
-'nonnull' parameter.

-Example
-$ cat test.m
-int bar(int*p, int q, int *r) __attribute__((nonnull(1,3)));
-
-int foo(int *p, int *q) {
-   return !p ? bar(q, 2, p)
-             : bar(p, 2, q);
-}
-
-

-Running scan-build over this source produces the following output:
-
-[image: example attribute nonnull]

-Mac OS X API Annotations
-
-Cocoa & Core Foundation Memory Management Annotations

-The analyzer supports the proper management of retain counts for both Cocoa
-and Core Foundation objects. This checking is largely based on enforcing
-Cocoa and Core Foundation naming conventions for Objective-C methods (Cocoa)
-and C functions (Core Foundation). Not strictly following these conventions
-can cause the analyzer to miss bugs or flag false positives.
-
-One can educate the analyzer (and others who read your code) about methods
-or functions that deviate from the Cocoa and Core Foundation conventions
-using the attributes described here. However, you should consider using
-proper naming conventions or the objc_method_family attribute, if
-applicable.

-Attribute 'ns_returns_retained' (Clang-specific)

-The GCC-style (Clang-specific) attribute 'ns_returns_retained' allows one to
-annotate an Objective-C method or C function as returning a retained Cocoa
-object that the caller is responsible for releasing (via sending a release
-message to the object). The Foundation framework defines a macro
-NS_RETURNS_RETAINED that is functionally equivalent to the one shown below.
-
-Placing on Objective-C methods: For Objective-C methods, this annotation
-essentially tells the analyzer to treat the method as if its name begins
-with "alloc" or "new" or contains the word "copy".
-
-Placing on C functions: For C functions returning Cocoa objects, the
-analyzer typically does not make any assumptions about whether or not the
-object is returned retained. Explicitly adding the 'ns_returns_retained'
-attribute to C functions allows the analyzer to perform extra checking.
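The page's example below exercises only Objective-C methods. As a
supplementary sketch for the C-function case, a declaration might look like
the following (the function name is hypothetical, not from the original
page):

__attribute__((ns_returns_retained))
NSString* makeGreeting();  // Analyzer treats the result as +1; the caller
                           // is responsible for releasing it.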

-Example
-$ cat test.m
-#import <Foundation/Foundation.h>
-
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_RETURNS_RETAINED
-#if __has_feature(attribute_ns_returns_retained)
-#define NS_RETURNS_RETAINED __attribute__((ns_returns_retained))
-#else
-#define NS_RETURNS_RETAINED
-#endif
-#endif
-
-@interface MyClass : NSObject {}
-- (NSString*) returnsRetained NS_RETURNS_RETAINED;
-- (NSString*) alsoReturnsRetained;
-@end
-
-@implementation MyClass
-- (NSString*) returnsRetained {
-  return [[NSString alloc] initWithCString:"no leak here"];
-}
-- (NSString*) alsoReturnsRetained {
-  return [[NSString alloc] initWithCString:"flag a leak"];
-}
-@end
-
-
-Running scan-build on this source file produces the following output:
-
-[image: example returns retained]

-Attribute 'ns_returns_not_retained' (Clang-specific)

-The 'ns_returns_not_retained' attribute is the complement of
-'ns_returns_retained'. Where a function or method may appear to obey the
-Cocoa conventions and return a retained Cocoa object, this attribute can be
-used to indicate that the returned object reference should not be considered
-an "owning" reference being returned to the caller. The Foundation framework
-defines a macro NS_RETURNS_NOT_RETAINED that is functionally equivalent to
-the one shown below.
-
-Usage is identical to ns_returns_retained. When using the attribute, be sure
-to declare it within the proper macro that checks for its availability, as
-it is not available in earlier versions of the analyzer:
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_RETURNS_NOT_RETAINED
-#if __has_feature(attribute_ns_returns_not_retained)
-#define NS_RETURNS_NOT_RETAINED __attribute__((ns_returns_not_retained))
-#else
-#define NS_RETURNS_NOT_RETAINED
-#endif
-#endif
-
-
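The deleted page showed only the availability macro here, without a usage
example. A minimal sketch, assuming a hypothetical class and method name:

@interface MyCache : NSObject {}
// Returns a string owned by the cache; the caller must not release it.
- (NSString*) cachedName NS_RETURNS_NOT_RETAINED;
@end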

-Attribute 'cf_returns_retained' (Clang-specific)

The GCC-style (Clang-specific) attribute 'cf_returns_retained' allows one to annotate an Objective-C method or C function as returning a retained Core Foundation object that the caller is responsible for releasing. The CoreFoundation framework defines a macro CF_RETURNS_RETAINED that is functionally equivalent to the one shown below.

Placing on Objective-C methods: With respect to Objective-C methods, this attribute is identical in its behavior and usage to 'ns_returns_retained' except for the distinction of returning a Core Foundation object instead of a Cocoa object.

This distinction is important for the following reason: as Core Foundation is a C API, the analyzer cannot always tell that a pointer return value refers to a Core Foundation object. In contrast, it is trivial for the analyzer to recognize if a pointer refers to a Cocoa object (given the Objective-C type system).

Placing on C functions: When placing the attribute 'cf_returns_retained' on the declarations of C functions, the analyzer interprets the function as:

1. Returning a Core Foundation object, and

2. Treating the function as if its name contained the keywords "create" or "copy". This means the returned object has a +1 retain count that must be released by the caller, either by sending a release message (via toll-free bridging to an Objective-C object pointer), or by calling CFRelease or a similar function.

Example

$ cat test.m
#import <Cocoa/Cocoa.h>

#ifndef __has_feature      // Optional.
#define __has_feature(x) 0 // Compatibility with non-clang compilers.
#endif

#ifndef CF_RETURNS_RETAINED
#if __has_feature(attribute_cf_returns_retained)
#define CF_RETURNS_RETAINED __attribute__((cf_returns_retained))
#else
#define CF_RETURNS_RETAINED
#endif
#endif

@interface MyClass : NSObject {}
- (NSDate*) returnsCFRetained CF_RETURNS_RETAINED;
- (NSDate*) alsoReturnsRetained;
- (NSDate*) returnsNSRetained NS_RETURNS_RETAINED;
@end

CF_RETURNS_RETAINED
CFDateRef returnsRetainedCFDate() {
  return CFDateCreate(0, CFAbsoluteTimeGetCurrent());
}

@implementation MyClass
- (NSDate*) returnsCFRetained {
  return (NSDate*) returnsRetainedCFDate(); // No leak.
}

- (NSDate*) alsoReturnsRetained {
  return (NSDate*) returnsRetainedCFDate(); // Always report a leak.
}

- (NSDate*) returnsNSRetained {
  return (NSDate*) returnsRetainedCFDate(); // Report a leak when using GC.
}
@end

Running scan-build on this example produces the following output:

[Image: example returns retained]

Attribute 'cf_returns_not_retained' (Clang-specific)

The 'cf_returns_not_retained' attribute is the complement of 'cf_returns_retained'. Where a function or method may appear to obey the Core Foundation or Cocoa conventions and return a retained Core Foundation object, this attribute can be used to indicate that the object reference returned should not be considered as an "owning" reference being returned to the caller. The CoreFoundation framework defines a macro CF_RETURNS_NOT_RETAINED that is functionally equivalent to the one shown below.

Usage is identical to cf_returns_retained. When using the attribute, be sure to declare it within the proper macro that checks for its availability, as it is not available in earlier versions of the analyzer:

$ cat test.m
#ifndef __has_feature      // Optional.
#define __has_feature(x) 0 // Compatibility with non-clang compilers.
#endif

#ifndef CF_RETURNS_NOT_RETAINED
#if __has_feature(attribute_cf_returns_not_retained)
#define CF_RETURNS_NOT_RETAINED __attribute__((cf_returns_not_retained))
#else
#define CF_RETURNS_NOT_RETAINED
#endif
#endif

Attribute 'ns_consumed' (Clang-specific)

The 'ns_consumed' attribute can be placed on a specific parameter in either the declaration of a function or an Objective-C method. It indicates to the static analyzer that a release message is implicitly sent to the parameter upon completion of the call to the given function or method. The Foundation framework defines a macro NS_RELEASES_ARGUMENT that is functionally equivalent to the NS_CONSUMED macro shown below.

Example

$ cat test.m
#ifndef __has_feature      // Optional.
#define __has_feature(x) 0 // Compatibility with non-clang compilers.
#endif

#ifndef NS_CONSUMED
#if __has_feature(attribute_ns_consumed)
#define NS_CONSUMED __attribute__((ns_consumed))
#else
#define NS_CONSUMED
#endif
#endif

void consume_ns(id NS_CONSUMED x);

void test() {
  id x = [[NSObject alloc] init];
  consume_ns(x); // No leak!
}

@interface Foo : NSObject
+ (void) releaseArg:(id) NS_CONSUMED x;
+ (void) releaseSecondArg:(id)x second:(id) NS_CONSUMED y;
@end

void test_method() {
  id x = [[NSObject alloc] init];
  [Foo releaseArg:x]; // No leak!
}

void test_method2() {
  id a = [[NSObject alloc] init];
  id b = [[NSObject alloc] init];
  [Foo releaseSecondArg:a second:b]; // 'a' is leaked, but 'b' is released.
}

Attribute 'cf_consumed' (Clang-specific)

The 'cf_consumed' attribute is practically identical to ns_consumed. The attribute can be placed on a specific parameter in either the declaration of a function or an Objective-C method. It indicates to the static analyzer that the object reference is implicitly passed to a call to CFRelease upon completion of the call to the given function or method. The CoreFoundation framework defines a macro CF_RELEASES_ARGUMENT that is functionally equivalent to the CF_CONSUMED macro shown below.

Operationally this attribute is nearly identical to 'ns_consumed'.

Example

$ cat test.m
#ifndef __has_feature      // Optional.
#define __has_feature(x) 0 // Compatibility with non-clang compilers.
#endif

#ifndef CF_CONSUMED
#if __has_feature(attribute_cf_consumed)
#define CF_CONSUMED __attribute__((cf_consumed))
#else
#define CF_CONSUMED
#endif
#endif

void consume_cf(id CF_CONSUMED x);
void consume_CFDate(CFDateRef CF_CONSUMED x);

void test() {
  id x = [[NSObject alloc] init];
  consume_cf(x); // No leak!
}

void test2() {
  CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
  consume_CFDate(date); // No leak, including under GC!
}

@interface Foo : NSObject
+ (void) releaseArg:(CFDateRef) CF_CONSUMED x;
@end

void test_method() {
  CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
  [Foo releaseArg:date]; // No leak!
}

Attribute 'ns_consumes_self' (Clang-specific)

The 'ns_consumes_self' attribute can be placed only on an Objective-C method declaration. It indicates that the receiver of the message is "consumed" (a single reference count decremented) after the message is sent. This matches the semantics of all "init" methods.

One use of this attribute is to declare your own init-like methods that do not follow the standard Cocoa naming conventions.

Example

#ifndef __has_feature
#define __has_feature(x) 0 // Compatibility with non-clang compilers.
#endif

#ifndef NS_CONSUMES_SELF
#if __has_feature(attribute_ns_consumes_self)
#define NS_CONSUMES_SELF __attribute__((ns_consumes_self))
#else
#define NS_CONSUMES_SELF
#endif
#endif

@interface MyClass : NSObject
- initWith:(MyClass *)x;
- nonstandardInitWith:(MyClass *)x NS_CONSUMES_SELF NS_RETURNS_RETAINED;
@end

In this example, -nonstandardInitWith: has the same ownership semantics as the init method -initWith:. The static analyzer will observe that the method consumes the receiver, and then returns an object with a +1 retain count.
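
For instance, a caller can use either initializer interchangeably (a hypothetical usage sketch; 'other' is assumed to be an existing MyClass instance):

MyClass *a = [[MyClass alloc] initWith:other];            // 'a' is returned at +1
MyClass *b = [[MyClass alloc] nonstandardInitWith:other]; // likewise: the alloc reference is consumed and a +1 object is returned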

The Foundation framework defines a macro NS_REPLACES_RECEIVER which is functionally equivalent to the combination of NS_CONSUMES_SELF and NS_RETURNS_RETAINED shown above.
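
For illustration, a compatibility definition in the same style as the other macros in this document might look like the following (a sketch; the actual Foundation definition may differ):

#ifndef NS_REPLACES_RECEIVER
#define NS_REPLACES_RECEIVER NS_CONSUMES_SELF NS_RETURNS_RETAINED
#endif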

Libkern Memory Management Annotations

Libkern requires developers to inherit all heap-allocated objects from OSObject and to perform manual reference counting. The reference counting model is very similar to MRR (manual retain-release) mode in Objective-C or to CoreFoundation reference counting. Freshly-allocated objects start with a reference count of 1, and calls to retain increment it, while calls to release decrement it. The object is deallocated whenever its reference count reaches zero.
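
For instance (a sketch of the model, assuming libkern's OSArray class; not meant as real kernel code):

OSArray *a = OSArray::withCapacity(8); // freshly allocated: reference count is 1
a->retain();                           // reference count is now 2
a->release();                          // back to 1
a->release();                          // reaches 0: the object is deallocated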

Manually incrementing and decrementing reference counts is error-prone: over-retains lead to leaks, and over-releases lead to uses-after-free. The analyzer can help the programmer to check for unbalanced retain/release calls.

The reference count checking is based on the principle of locality: it should be possible to establish correctness (lack of leaks/uses after free) by looking at each function body, and the declarations (not the definitions) of all the functions it interacts with.

In order to support such reasoning, it should be possible to summarize, using attributes, the behavior of each function with respect to the reference counts of its returned values and parameters.

By default, the following summaries are assumed:

• All functions starting with get or Get, unless they are returning subclasses of OSIterator, are assumed to be returning at +0. That is, the caller has no reference-count obligations with respect to the returned object and should leave it untouched.

• All other functions are assumed to return at +1. That is, the caller has an obligation to release such objects.

• Functions are assumed not to change the reference count of their parameters, including the implicit this parameter.
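
For example, under these default summaries (hypothetical declarations, not from any real kernel header):

OSArray *getCurrentList();    // "get" prefix: returned at +0, the caller must not release it
OSArray *copyCurrentList();   // any other name: returned at +1, the caller must release it
void inspectList(OSArray *l); // parameters (and the implicit this) are assumed unchanged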

These summaries can be overridden with the following attributes:

Attribute 'os_returns_retained'

The os_returns_retained attribute (accessed through the macro LIBKERN_RETURNS_RETAINED) plays a role identical to ns_returns_retained for functions returning OSObject subclasses. The attribute indicates that it is the caller's responsibility to release the returned object.
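
For instance (a hypothetical declaration):

LIBKERN_RETURNS_RETAINED OSObject *copyStoredObject(); // the caller owns the result and must release it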

Attribute 'os_returns_not_retained'

The os_returns_not_retained attribute (accessed through the macro LIBKERN_RETURNS_NOT_RETAINED) plays a role identical to ns_returns_not_retained for functions returning OSObject subclasses. The attribute indicates that the caller should not change the retain count of the returned object.
Example

class MyClass {
  OSObject *f;
  LIBKERN_RETURNS_NOT_RETAINED OSObject *myFieldGetter();
};

// Note that the annotation only has to be applied to the function declaration.
OSObject *MyClass::myFieldGetter() {
  return f;
}

Attribute 'os_consumed'

Similarly to the ns_consumed attribute, the os_consumed attribute (accessed through LIBKERN_CONSUMED), applied to a parameter, indicates that the call to the function consumes the parameter: the callee should either release it or store it and release it in the destructor, while the caller should assume one is subtracted from the reference count after the call.

IOReturn addToList(LIBKERN_CONSUMED IOPMinformee *newInformee);

Attribute 'os_consumes_this'

Similarly to ns_consumes_self, the os_consumes_this attribute indicates that the method call consumes the implicit this argument: the caller should assume one was subtracted from the reference count of the object after the call, and the callee has an obligation to either release the argument, or store it and eventually release it in the destructor.

void addThisToList(OSArray *givenList) LIBKERN_CONSUMES_THIS;

Out Parameters

A function can also return an object to a caller by means of an out parameter (a pointer-to-OSObject-pointer is passed, and the callee writes a pointer to an object into the argument). Currently the analyzer does not track unannotated out parameters by default, but with annotations we distinguish four separate cases:

1. Non-retained out parameters, identified using LIBKERN_RETURNS_NOT_RETAINED applied to parameters, e.g.:

void getterViaOutParam(LIBKERN_RETURNS_NOT_RETAINED OSObject **obj)

Such functions write a non-retained object into an out parameter, and the caller has no further obligations.

2. Retained out parameters, identified using LIBKERN_RETURNS_RETAINED:

void getterViaOutParam(LIBKERN_RETURNS_RETAINED OSObject **obj)

In such cases a retained object is written into the out parameter, which the caller then has to release in order to avoid a leak.

These two cases are simple, but in practice functions that return objects via out parameters usually also return a return code, and then the out parameter may or may not be written into, depending on the return code, e.g.:

bool maybeCreateObject(LIBKERN_RETURNS_RETAINED OSObject **obj);

For such functions, the usual semantics are that an object is written into the out parameter on "success", and not written into on "failure".

For LIBKERN_RETURNS_RETAINED we assume the following definition of success:

For functions returning OSReturn or IOReturn (or any other typedef to kern_return_t), success is defined as a return value of zero (kIOReturnSuccess is zero). For all others, success is a non-zero value (e.g. a non-null pointer for pointer-returning functions).
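
For instance, a caller of the hypothetical maybeCreateObject above would check the return value before touching the out parameter:

OSObject *obj = nullptr;
if (maybeCreateObject(&obj)) { // non-zero (true) means success for a bool return
  // A retained object was written into obj; the caller must release it.
  obj->release();
}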

3. Retained out parameters on zero return. The annotation LIBKERN_RETURNS_RETAINED_ON_ZERO states that a retained object is written into the parameter if and only if the function returns a zero value:

bool OSUnserializeXML(void *data, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errString);

The caller then has to release the object if the function has returned zero.
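
A caller-side sketch of this contract, following the declaration above (data is assumed to point to a valid serialized buffer):

OSString *errString = nullptr;
if (OSUnserializeXML(data, &errString) == 0) {
  // Zero return: a retained OSString was written into errString,
  // so the caller is responsible for releasing it.
  errString->release();
}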

4. Retained out parameters on non-zero return. Similarly, LIBKERN_RETURNS_RETAINED_ON_NONZERO specifies that a retained object is written into the parameter if and only if the function has returned a non-zero value.

Note that for non-retained out parameters the conditionals do not matter, as the caller has no obligations regardless of whether an object is written into or not.

Custom Assertion Handlers

The analyzer exploits code assertions by pruning off paths where the assertion condition is false. The idea is to capture any program invariants specified in the assertion that the developer may know but that are not immediately apparent in the code itself. In this way assertions make implicit assumptions explicit in the code, which not only makes the analyzer more accurate when finding bugs, but can also help others better understand your code. It can also help remove certain kinds of analyzer false positives by pruning off false paths.

In order to exploit assertions, however, the analyzer must understand when it encounters an "assertion handler." Typically assertions are implemented with a macro, with the macro performing a check for the assertion condition and, when the check fails, calling an assertion handler. For example, consider the following code fragment:

void foo(int *p) {
  assert(p != NULL);
}

When this code is preprocessed on Mac OS X it expands to the following:

void foo(int *p) {
  (__builtin_expect(!(p != NULL), 0) ? __assert_rtn(__func__, "t.c", 4, "p != NULL") : (void)0);
}

In this example, the assertion handler is __assert_rtn. When called, most assertion handlers typically print an error and terminate the program. The analyzer can exploit such semantics by ending the analysis of a path once it hits a call to an assertion handler.

The trick, however, is that the analyzer needs to know that a called function is an assertion handler; otherwise the analyzer might assume the function call returns and it will continue analyzing the path where the assertion condition failed. This can lead to false positives, as the assertion condition usually implies a safety condition (e.g., a pointer is not null) prior to performing some action that depends on that condition (e.g., dereferencing a pointer).

The analyzer knows about several well-known assertion handlers, but it can also automatically infer that a function should be treated as an assertion handler if it is annotated with the 'noreturn' attribute or the (Clang-specific) 'analyzer_noreturn' attribute. Note that, currently, clang does not support these attributes on Objective-C and C++ methods.

Attribute 'noreturn'

The 'noreturn' attribute is a GCC attribute that can be placed on the declarations of functions. It means exactly what its name implies: a function with a 'noreturn' attribute should never return.

Specific details of the syntax of using the 'noreturn' attribute can be found in GCC's documentation.

Not only does the analyzer exploit this information when pruning false paths, but the compiler also takes it seriously and will generate different (and possibly better optimized) code under the assumption that the function does not return.

Example

On Mac OS X, the function prototype for __assert_rtn (declared in assert.h) is specifically annotated with the 'noreturn' attribute:

void __assert_rtn(const char *, const char *, int, const char *) __attribute__((__noreturn__));

Attribute 'analyzer_noreturn' (Clang-specific)

The Clang-specific 'analyzer_noreturn' attribute is almost identical to 'noreturn' except that it is ignored by the compiler for the purposes of code generation.

This attribute is useful for annotating assertion handlers that actually can return, but for the purpose of using the analyzer we want to pretend that such functions do not return.

Because this attribute is Clang-specific, its use should be conditioned with the use of preprocessor macros.

Example

#ifndef __has_feature
#define __has_feature(x) 0 // Compatibility with non-clang compilers.
#endif

#ifndef CLANG_ANALYZER_NORETURN
#if __has_feature(attribute_analyzer_noreturn)
#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
#else
#define CLANG_ANALYZER_NORETURN
#endif
#endif

void my_assert_rtn(const char *, const char *, int, const char *) CLANG_ANALYZER_NORETURN;
diff --git a/clang/www/analyzer/images/scan_build_cmd.png b/clang/www/analyzer/images/scan_build_cmd.png
deleted file mode 100644
index 464fd4e129a20..0000000000000
Binary files a/clang/www/analyzer/images/scan_build_cmd.png and /dev/null differ
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index f2716f1e4c653..564502c1f3e92 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -1065,7 +1065,7 @@

C++ defect report implementation status

170 DRWP Pointer-to-member conversions - Unknown + Clang 3.1 171 diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp index bd85c50a083a6..7a1a47a78dbc6 100644 --- a/compiler-rt/lib/interception/interception_win.cpp +++ b/compiler-rt/lib/interception/interception_win.cpp @@ -636,12 +636,17 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0xFF8B: // 8B FF : mov edi, edi case 0xEC8B: // 8B EC : mov ebp, esp case 0xc889: // 89 C8 : mov eax, ecx + case 0xD189: // 89 D1 : mov ecx, edx case 0xE589: // 89 E5 : mov ebp, esp case 0xC18B: // 8B C1 : mov eax, ecx + case 0xC031: // 31 C0 : xor eax, eax + case 0xC931: // 31 C9 : xor ecx, ecx + case 0xD231: // 31 D2 : xor edx, edx case 0xC033: // 33 C0 : xor eax, eax case 0xC933: // 33 C9 : xor ecx, ecx case 0xD233: // 33 D2 : xor edx, edx case 0xDB84: // 84 DB : test bl,bl + case 0xC084: // 84 C0 : test al,al case 0xC984: // 84 C9 : test cl,cl case 0xD284: // 84 D2 : test dl,dl return 2; diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp index 3a2d8b271113d..e0258a3d0bd51 100644 --- a/compiler-rt/lib/interception/tests/interception_win_test.cpp +++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp @@ -839,14 +839,19 @@ const struct InstructionSizeData { { 1, {0x90}, 0, "90 : nop"}, { 1, {0xC3}, 0, "C3 : ret (for small/empty function interception"}, { 1, {0xCC}, 0, "CC : int 3 i.e. registering weak functions)"}, + { 2, {0x31, 0xC0}, 0, "31 C0 : xor eax, eax"}, + { 2, {0x31, 0xC9}, 0, "31 C9 : xor ecx, ecx"}, + { 2, {0x31, 0xD2}, 0, "31 D2 : xor edx, edx"}, { 2, {0x33, 0xC0}, 0, "33 C0 : xor eax, eax"}, { 2, {0x33, 0xC9}, 0, "33 C9 : xor ecx, ecx"}, { 2, {0x33, 0xD2}, 0, "33 D2 : xor edx, edx"}, { 2, {0x6A, 0x71}, 0, "6A XX : push XX"}, + { 2, {0x84, 0xC0}, 0, "84 C0 : test al,al"}, { 2, {0x84, 0xC9}, 0, "84 C9 : test cl,cl"}, { 2, {0x84, 0xD2}, 0, "84 D2 : test dl,dl"}, { 2, {0x84, 0xDB}, 0, "84 DB : test bl,bl"}, { 2, {0x89, 0xc8}, 0, "89 C8 : mov eax, ecx"}, + { 2, {0x89, 0xD1}, 0, "89 D1 : mov ecx, edx"}, { 2, {0x89, 0xE5}, 0, "89 E5 : mov ebp, esp"}, { 2, {0x8A, 0x01}, 0, "8A 01 : mov al, byte ptr [ecx]"}, { 2, {0x8B, 0xC1}, 0, "8B C1 : mov eax, ecx"}, diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp index 9c87b4782671a..f0230df9260e3 100644 --- a/compiler-rt/lib/tysan/tysan.cpp +++ b/compiler-rt/lib/tysan/tysan.cpp @@ -131,6 +131,17 @@ static bool isAliasingLegalUp(tysan_type_descriptor *TDA, break; } + // This offset can't be negative. Therefore we must be accessing something + // before the current type (not legal) or partially inside the last type. + // In the latter case, we adjust Idx. + if (TDA->Struct.Members[Idx].Offset > OffsetA) { + // Trying to access something before the current type. 
+ if (!Idx) + return false; + + Idx -= 1; + } + OffsetA -= TDA->Struct.Members[Idx].Offset; TDA = TDA->Struct.Members[Idx].Type; } else { diff --git a/compiler-rt/test/tysan/struct-offset-different-base.cpp b/compiler-rt/test/tysan/struct-offset-different-base.cpp new file mode 100644 index 0000000000000..862595de8dc81 --- /dev/null +++ b/compiler-rt/test/tysan/struct-offset-different-base.cpp @@ -0,0 +1,49 @@ +// RUN: %clangxx_tysan -O0 %s -o %t && %run %t >%t.out 2>&1 +// RUN: FileCheck %s --implicit-check-not ERROR < %t.out + +// Modified reproducer from https://github.com/llvm/llvm-project/issues/105960 + +#include + +struct inner1 { + char buffer; + int i; +}; + +struct inner2 { + char buffer; + int i; + float endBuffer; +}; + +void init_inner1(inner1 *iPtr) { iPtr->i = 200; } +void init_inner2(inner2 *iPtr) { + iPtr->i = 400; + iPtr->endBuffer = 413.0f; +} + +struct outer { + inner1 foo; + inner2 bar; + char buffer; +}; + +int main(void) { + outer *l = new outer(); + + init_inner1(&l->foo); + init_inner2(&l->bar); + + int access = l->foo.i; + printf("Accessed value 1 is %d\n", access); + access = l->bar.i; + printf("Accessed value 2 is %d\n", access); + float fAccess = l->bar.endBuffer; + printf("Accessed value 3 is %f\n", fAccess); + + return 0; +} + +// CHECK: Accessed value 1 is 200 +// CHECK: Accessed value 2 is 400 +// CHECK: Accessed value 3 is 413.0 diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py index 29d7867e80867..4496fdf3cb0e8 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py @@ -98,7 +98,7 @@ def _build_command( def label_to_line(label_name: str) -> int: line = labels.get(label_name, None) - if line != None: + if line is not None: return line raise format_unresolved_label_err(label_name, raw_text, path.base, lineno) @@ -128,7 +128,7 @@ def get_address_object(address_name: str, offset: int = 0): def _search_line_for_cmd_start(line: str, start: int, valid_commands: dict) -> int: - """Scan `line` for a string matching any key in `valid_commands`. + r"""Scan `line` for a string matching any key in `valid_commands`. Start searching from `start`. Commands escaped with `\` (E.g. `\DexLabel('a')`) are ignored. 
@@ -543,7 +543,7 @@ def test_parse_share_line(self): def test_parse_escaped(self): """Escaped commands are ignored.""" - lines = ['words \MockCmd("IGNORED") words words words\n'] + lines = ['words \\MockCmd("IGNORED") words words words\n'] values = self._find_all_mock_values_in_lines(lines) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py index a7d6b570b55e8..ac3054c3a0edf 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py @@ -62,7 +62,7 @@ def __init__( self.finish_on_remove = finish_on_remove def has_conditions(self): - return self.expression != None + return self.expression is not None def get_conditional_expression_list(self): conditional_list = [] @@ -76,7 +76,7 @@ def add_hit(self): self.current_hit_count += 1 def should_be_removed(self): - if self.max_hit_count == None: + if self.max_hit_count is None: return False return self.current_hit_count >= self.max_hit_count diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py index 3e5a7b919d703..a4ca5ae0158e9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py @@ -39,7 +39,7 @@ def update_step_watches(step_info, watches, commands): for watch in towatch: loc = step_info.current_location if ( - loc.path != None + loc.path is not None and os.path.exists(loc.path) and os.path.samefile(watch.path, loc.path) and have_hit_line(watch, loc) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py index 1b0d4d5871cbe..67b715af78698 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py @@ -183,7 +183,7 @@ def handle_debugger_tool_options(context, defaults): # noqa if options.debugger == "lldb": _warn_meaningless_option(context, "--show-debugger") - if options.source_root_dir != None: + if options.source_root_dir is not None: if not os.path.isabs(options.source_root_dir): raise ToolArgumentError( f'--source-root-dir: expected absolute path, got "{options.source_root_dir}"' diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py index a6752274efac2..a7f12cde1f047 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py @@ -256,7 +256,7 @@ def delete_breakpoints(self, ids): for bp in self._debugger.Breakpoints: # We're looking at the user-set breakpoints so there should be no # Parent. 
- assert bp.Parent == None + assert bp.Parent is None this_vsbp = VSBreakpoint( PurePath(bp.File), bp.FileLine, bp.FileColumn, bp.Condition ) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py index f07641041254b..c366062cec7a9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py @@ -150,7 +150,7 @@ def _get_results_path(self, test_name): """Returns the path to the test results directory for the test denoted by test_name. """ - assert self.context.options.results_directory != None + assert self.context.options.results_directory is not None return os.path.join( self.context.options.results_directory, self._get_results_basename(test_name), diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index 9935fe6a199da..66fdd63632885 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -51,7 +51,7 @@ def get_required_attr(config, attr_name): attr_value = getattr(config, attr_name, None) - if attr_value == None: + if attr_value is None: lit_config.fatal( "No attribute %r in test configuration! You may need to run " "tests from your build directory or add this attribute " @@ -223,7 +223,7 @@ def can_target_host(): xcode_lldb_vers = subprocess.check_output(["xcrun", "lldb", "--version"]).decode( "utf-8" ) - match = re.search("lldb-(\d+)", xcode_lldb_vers) + match = re.search(r"lldb-(\d+)", xcode_lldb_vers) if match: apple_lldb_vers = int(match.group(1)) if apple_lldb_vers < 1000: @@ -247,7 +247,7 @@ def get_gdb_version_string(): if len(gdb_vers_lines) < 1: print("Unkown GDB version format (too few lines)", file=sys.stderr) return None - match = re.search("GNU gdb \(.*?\) ((\d|\.)+)", gdb_vers_lines[0].strip()) + match = re.search(r"GNU gdb \(.*?\) ((\d|\.)+)", gdb_vers_lines[0].strip()) if match is None: print(f"Unkown GDB version format: {gdb_vers_lines[0]}", file=sys.stderr) return None @@ -261,7 +261,7 @@ def get_clang_default_dwarf_version_string(triple): # Get the flags passed by the driver and look for -dwarf-version. cmd = f'{llvm_config.use_llvm_tool("clang")} -g -xc -c - -v -### --target={triple}' stderr = subprocess.run(cmd.split(), stderr=subprocess.PIPE).stderr.decode() - match = re.search("-dwarf-version=(\d+)", stderr) + match = re.search(r"-dwarf-version=(\d+)", stderr) if match is None: print("Cannot determine default dwarf version", file=sys.stderr) return None diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 68947eaa9c9bd..b619553ef8302 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -91,28 +91,37 @@ if (FLANG_STANDALONE_BUILD) # If the user specifies a relative path to LLVM_DIR, the calls to include # LLVM modules fail. Append the absolute path to LLVM_DIR instead. - get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} - REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) + if (LLVM_DIR) + get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} + REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) + endif() # We need a pre-built/installed version of LLVM. find_package(LLVM REQUIRED HINTS "${LLVM_DIR_ABSOLUTE}") + if (NOT LLVM_DIR_ABSOLUTE) + # If the user did not specify a LLVM_DIR (and therefore LLVM_DIR_ABSOLUTE + # was not set), append the discovered path to CMAKE_MODULE_PATH. 
+ list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR}) + endif() # Users might specify a path to CLANG_DIR that's: # * a full path, or # * a path relative to the path of this script. # Append the absolute path to CLANG_DIR so that find_package works in both # cases. - get_filename_component( - CLANG_DIR_ABSOLUTE - ${CLANG_DIR} - REALPATH - BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR_ABSOLUTE}) - - # TODO: Remove when libclangDriver is lifted out of Clang - find_package(Clang REQUIRED PATHS "${CLANG_DIR_ABSOLUTE}" NO_DEFAULT_PATH) - if (NOT Clang_FOUND) - message(FATAL_ERROR "Failed to find Clang") + if (CLANG_DIR) + get_filename_component( + CLANG_DIR_ABSOLUTE + ${CLANG_DIR} + REALPATH + BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR_ABSOLUTE}) + + # TODO: Remove when libclangDriver is lifted out of Clang + find_package(Clang REQUIRED PATHS "${CLANG_DIR_ABSOLUTE}" NO_DEFAULT_PATH) + else() + find_package(Clang REQUIRED) + list(APPEND CMAKE_MODULE_PATH ${Clang_DIR}) endif() # If LLVM links to zlib we need the imported targets so we can too. @@ -134,10 +143,15 @@ if (FLANG_STANDALONE_BUILD) include(TableGen) # If the user specifies a relative path to MLIR_DIR, the calls to include # MLIR modules fail. Append the absolute path to MLIR_DIR instead. - get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} - REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE}) + if (MLIR_DIR) + get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} + REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE}) + endif() find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR_ABSOLUTE}) + if (NOT MLIR_DIR_ABSOLUTE) + list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR}) + endif() # Use SYSTEM for the same reasons as for LLVM includes include_directories(SYSTEM ${MLIR_INCLUDE_DIRS}) include(AddMLIR) diff --git a/flang/include/flang/Optimizer/CodeGen/TypeConverter.h b/flang/include/flang/Optimizer/CodeGen/TypeConverter.h index 7c317ddeea1fa..20270d41b1e9a 100644 --- a/flang/include/flang/Optimizer/CodeGen/TypeConverter.h +++ b/flang/include/flang/Optimizer/CodeGen/TypeConverter.h @@ -62,7 +62,7 @@ class LLVMTypeConverter : public mlir::LLVMTypeConverter { // fir.type --> llvm<"%name = { ty... }"> std::optional convertRecordType(fir::RecordType derived, - llvm::SmallVectorImpl &results); + llvm::SmallVectorImpl &results, bool isPacked); // Is an extended descriptor needed given the element type of a fir.box type ? // Extended descriptors are required for derived types. diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td index 3919c9191c212..6ae74f16a72d3 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td +++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td @@ -346,6 +346,12 @@ def fir_RecordType : FIR_Type<"Record", "type"> { void finalize(llvm::ArrayRef lenPList, llvm::ArrayRef typeList); + // fir.type is unpacked by default. If the flag is set, the packed fir.type + // is generated and the alignment is enforced by explicit padding by i8 + // array fields. 
+ bool isPacked() const; + void pack(bool); + detail::RecordTypeStorage const *uniqueKey() const; }]; } diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp index 452ddda426fa1..31b85ef2b5476 100644 --- a/flang/lib/Lower/ConvertType.cpp +++ b/flang/lib/Lower/ConvertType.cpp @@ -20,6 +20,8 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" #define DEBUG_TYPE "flang-lower-type" @@ -385,9 +387,20 @@ struct TypeBuilderImpl { // with dozens of components/parents (modern Fortran). derivedTypeInConstruction.try_emplace(&derivedScope, rec); + auto targetTriple{llvm::Triple( + llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()))}; + // Always generate packed FIR struct type for bind(c) derived type for AIX + if (targetTriple.getOS() == llvm::Triple::OSType::AIX && + tySpec.typeSymbol().attrs().test(Fortran::semantics::Attr::BIND_C) && + !IsIsoCType(&tySpec)) { + rec.pack(true); + } + // Gather the record type fields. // (1) The data components. if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { + size_t prev_offset{0}; + unsigned padCounter{0}; // In HLFIR the parent component is the first fir.type component. for (const auto &componentName : typeSymbol.get() @@ -397,7 +410,38 @@ struct TypeBuilderImpl { "failed to find derived type component symbol"); const Fortran::semantics::Symbol &component = scopeIter->second.get(); mlir::Type ty = genSymbolType(component); + if (rec.isPacked()) { + auto compSize{component.size()}; + auto compOffset{component.offset()}; + + if (prev_offset < compOffset) { + size_t pad{compOffset - prev_offset}; + mlir::Type i8Ty{mlir::IntegerType::get(context, 8)}; + fir::SequenceType::Shape shape{static_cast(pad)}; + mlir::Type padTy{fir::SequenceType::get(shape, i8Ty)}; + prev_offset += pad; + cs.emplace_back("__padding" + std::to_string(padCounter++), padTy); + } + prev_offset += compSize; + } cs.emplace_back(converter.getRecordTypeFieldName(component), ty); + if (rec.isPacked()) { + // For the last component, determine if any padding is needed. 
+ if (componentName == + typeSymbol.get() + .componentNames() + .back()) { + auto compEnd{component.offset() + component.size()}; + if (compEnd < derivedScope.size()) { + size_t pad{derivedScope.size() - compEnd}; + mlir::Type i8Ty{mlir::IntegerType::get(context, 8)}; + fir::SequenceType::Shape shape{static_cast(pad)}; + mlir::Type padTy{fir::SequenceType::get(shape, i8Ty)}; + cs.emplace_back("__padding" + std::to_string(padCounter++), + padTy); + } + } + } } } else { for (const auto &component : diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index c71fd598d5c8a..8a1029426d30c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1336,19 +1336,18 @@ static void genWorkshareClauses(lower::AbstractConverter &converter, cp.processNowait(clauseOps); } -static void genTeamsClauses(lower::AbstractConverter &converter, - semantics::SemanticsContext &semaCtx, - lower::StatementContext &stmtCtx, - const List &clauses, mlir::Location loc, - mlir::omp::TeamsOperands &clauseOps) { +static void genTeamsClauses( + lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + lower::StatementContext &stmtCtx, const List &clauses, + mlir::Location loc, mlir::omp::TeamsOperands &clauseOps, + llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); cp.processNumTeams(stmtCtx, clauseOps); cp.processThreadLimit(stmtCtx, clauseOps); + cp.processReduction(loc, clauseOps, reductionSyms); // TODO Support delayed privatization. - - cp.processTODO(loc, llvm::omp::Directive::OMPD_teams); } static void genWsloopClauses( @@ -2015,13 +2014,29 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { lower::StatementContext stmtCtx; + mlir::omp::TeamsOperands clauseOps; - genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); + llvm::SmallVector reductionSyms; + genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps, + reductionSyms); + + EntryBlockArgs args; + // TODO: Add private syms and vars. 
+ args.reduction.syms = reductionSyms; + args.reduction.vars = clauseOps.reductionVars; + + auto genRegionEntryCB = [&](mlir::Operation *op) { + genEntryBlock(converter.getFirOpBuilder(), args, op->getRegion(0)); + bindEntryBlockArgs( + converter, llvm::cast(op), args); + return llvm::to_vector(args.getSyms()); + }; return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_teams) - .setClauses(&item->clauses), + .setClauses(&item->clauses) + .setGenRegionEntryCb(genRegionEntryCB), queue, item, clauseOps); } diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp index 104ae7408b80c..ad7272eaa9d3f 100644 --- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp +++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp @@ -167,6 +167,7 @@ class BoxprocTypeRewriter : public mlir::TypeConverter { cs.emplace_back(t.first, t.second); } rec.finalize(ps, cs); + rec.pack(ty.isPacked()); return rec; }); addConversion([&](TypeDescType ty) { diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp index c23203efcd3df..0eace903720f0 100644 --- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp +++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp @@ -82,7 +82,7 @@ LLVMTypeConverter::LLVMTypeConverter(mlir::ModuleOp module, bool applyTBAA, [&](fir::PointerType pointer) { return convertPointerLike(pointer); }); addConversion( [&](fir::RecordType derived, llvm::SmallVectorImpl &results) { - return convertRecordType(derived, results); + return convertRecordType(derived, results, derived.isPacked()); }); addConversion( [&](fir::ReferenceType ref) { return convertPointerLike(ref); }); @@ -133,8 +133,10 @@ mlir::Type LLVMTypeConverter::indexType() const { } // fir.type --> llvm<"%name = { ty... }"> -std::optional LLVMTypeConverter::convertRecordType( - fir::RecordType derived, llvm::SmallVectorImpl &results) { +std::optional +LLVMTypeConverter::convertRecordType(fir::RecordType derived, + llvm::SmallVectorImpl &results, + bool isPacked) { auto name = fir::NameUniquer::dropTypeConversionMarkers(derived.getName()); auto st = mlir::LLVM::LLVMStructType::getIdentified(&getContext(), name); @@ -156,7 +158,7 @@ std::optional LLVMTypeConverter::convertRecordType( else members.push_back(mlir::cast(convertType(mem.second))); } - if (mlir::failed(st.setBody(members, /*isPacked=*/false))) + if (mlir::failed(st.setBody(members, isPacked))) return mlir::failure(); results.push_back(st); return mlir::success(); diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index cba7fa6412850..d25e5651f1142 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -165,16 +165,20 @@ struct RecordTypeStorage : public mlir::TypeStorage { setTypeList(typeList); } + bool isPacked() const { return packed; } + void pack(bool p) { packed = p; } + protected: std::string name; bool finalized; + bool packed; std::vector lens; std::vector types; private: RecordTypeStorage() = delete; explicit RecordTypeStorage(llvm::StringRef name) - : name{name}, finalized{false} {} + : name{name}, finalized{false}, packed{false} {} }; } // namespace detail @@ -872,9 +876,14 @@ llvm::LogicalResult fir::PointerType::verify( //===----------------------------------------------------------------------===// // Fortran derived type +// unpacked: // `type` `<` name // (`(` id `:` type (`,` id `:` type)* `)`)? 
// (`{` id `:` type (`,` id `:` type)* `}`)? '>' +// packed: +// `type` `<` name +// (`(` id `:` type (`,` id `:` type)* `)`)? +// (`<{` id `:` type (`,` id `:` type)* `}>`)? '>' mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { llvm::StringRef name; if (parser.parseLess() || parser.parseKeyword(&name)) @@ -900,6 +909,10 @@ mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { } RecordType::TypeList typeList; + if (!parser.parseOptionalLess()) { + result.pack(true); + } + if (!parser.parseOptionalLBrace()) { while (true) { llvm::StringRef field; @@ -913,8 +926,10 @@ mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { if (parser.parseOptionalComma()) break; } - if (parser.parseRBrace()) - return {}; + if (parser.parseOptionalGreater()) { + if (parser.parseRBrace()) + return {}; + } } if (parser.parseGreater()) @@ -941,6 +956,9 @@ void fir::RecordType::print(mlir::AsmPrinter &printer) const { printer << ')'; } if (getTypeList().size()) { + if (isPacked()) { + printer << '<'; + } char ch = '{'; for (auto p : getTypeList()) { printer << ch << p.first << ':'; @@ -948,6 +966,9 @@ void fir::RecordType::print(mlir::AsmPrinter &printer) const { ch = ','; } printer << '}'; + if (isPacked()) { + printer << '>'; + } } recordTypeVisited.erase(uniqueKey()); } @@ -973,6 +994,10 @@ RecordType::TypeList fir::RecordType::getLenParamList() const { bool fir::RecordType::isFinalized() const { return getImpl()->isFinalized(); } +void fir::RecordType::pack(bool p) { getImpl()->pack(p); } + +bool fir::RecordType::isPacked() const { return getImpl()->isPacked(); } + detail::RecordTypeStorage const *fir::RecordType::uniqueKey() const { return getImpl(); } diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp index 94640fa30baa5..6d4fce2f00a6d 100644 --- a/flang/lib/Semantics/compute-offsets.cpp +++ b/flang/lib/Semantics/compute-offsets.cpp @@ -17,6 +17,8 @@ #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" #include "flang/Semantics/type.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" #include #include @@ -51,9 +53,12 @@ class ComputeOffsetsHelper { SymbolAndOffset Resolve(const SymbolAndOffset &); std::size_t ComputeOffset(const EquivalenceObject &); // Returns amount of padding that was needed for alignment - std::size_t DoSymbol(Symbol &); + std::size_t DoSymbol( + Symbol &, std::optional newAlign = std::nullopt); SizeAndAlignment GetSizeAndAlignment(const Symbol &, bool entire); std::size_t Align(std::size_t, std::size_t); + std::optional CompAlignment(const Symbol &); + std::optional HasSpecialAlign(const Symbol &, Scope &); SemanticsContext &context_; std::size_t offset_{0}; @@ -65,6 +70,69 @@ class ComputeOffsetsHelper { equivalenceBlock_; }; +// This function is only called if the target platform is AIX. +static bool isReal8OrLarger(const Fortran::semantics::DeclTypeSpec *type) { + return ((type->IsNumeric(common::TypeCategory::Real) || + type->IsNumeric(common::TypeCategory::Complex)) && + evaluate::ToInt64(type->numericTypeSpec().kind()) > 4); +} + +// This function is only called if the target platform is AIX. +// It determines the alignment of a component. If the component is a derived +// type, the alignment is computed accordingly. 
+std::optional ComputeOffsetsHelper::CompAlignment(const Symbol &sym) { + size_t max_align{0}; + constexpr size_t fourByteAlign{4}; + bool contain_double{false}; + auto derivedTypeSpec{sym.GetType()->AsDerived()}; + DirectComponentIterator directs{*derivedTypeSpec}; + for (auto it{directs.begin()}; it != directs.end(); ++it) { + auto type{it->GetType()}; + auto s{GetSizeAndAlignment(*it, true)}; + if (isReal8OrLarger(type)) { + max_align = std::max(max_align, fourByteAlign); + contain_double = true; + } else if (type->AsDerived()) { + if (const auto newAlgin{CompAlignment(*it)}) { + max_align = std::max(max_align, s.alignment); + } else { + return std::nullopt; + } + } else { + max_align = std::max(max_align, s.alignment); + } + } + + if (contain_double) { + return max_align; + } else { + return std::nullopt; + } +} + +// This function is only called if the target platform is AIX. +// Special alignment is needed only if it is a bind(c) derived type +// and contain real type components that have larger than 4 bytes. +std::optional ComputeOffsetsHelper::HasSpecialAlign( + const Symbol &sym, Scope &scope) { + // On AIX, if the component that is not the first component and is + // a float of 8 bytes or larger, it has the 4-byte alignment. + // Only set the special alignment for bind(c) derived type on that platform. + if (const auto type{sym.GetType()}) { + auto &symOwner{sym.owner()}; + if (symOwner.symbol() && symOwner.IsDerivedType() && + symOwner.symbol()->attrs().HasAny({semantics::Attr::BIND_C}) && + &sym != &(*scope.GetSymbols().front())) { + if (isReal8OrLarger(type)) { + return 4UL; + } else if (type->AsDerived()) { + return CompAlignment(sym); + } + } + } + return std::nullopt; +} + void ComputeOffsetsHelper::Compute(Scope &scope) { for (Scope &child : scope.children()) { ComputeOffsets(context_, child); @@ -113,7 +181,15 @@ void ComputeOffsetsHelper::Compute(Scope &scope) { if (!FindCommonBlockContaining(*symbol) && dependents_.find(symbol) == dependents_.end() && equivalenceBlock_.find(symbol) == equivalenceBlock_.end()) { - DoSymbol(*symbol); + + std::optional newAlign{std::nullopt}; + // Handle special alignment requirement for AIX + auto triple{llvm::Triple( + llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()))}; + if (triple.getOS() == llvm::Triple::OSType::AIX) { + newAlign = HasSpecialAlign(*symbol, scope); + } + DoSymbol(*symbol, newAlign); if (auto *generic{symbol->detailsIf()}) { if (Symbol * specific{generic->specific()}; specific && !FindCommonBlockContaining(*specific)) { @@ -313,7 +389,8 @@ std::size_t ComputeOffsetsHelper::ComputeOffset( return result; } -std::size_t ComputeOffsetsHelper::DoSymbol(Symbol &symbol) { +std::size_t ComputeOffsetsHelper::DoSymbol( + Symbol &symbol, std::optional newAlign) { if (!symbol.has() && !symbol.has()) { return 0; } @@ -322,12 +399,13 @@ std::size_t ComputeOffsetsHelper::DoSymbol(Symbol &symbol) { return 0; } std::size_t previousOffset{offset_}; - offset_ = Align(offset_, s.alignment); + size_t alignVal{newAlign.value_or(s.alignment)}; + offset_ = Align(offset_, alignVal); std::size_t padding{offset_ - previousOffset}; symbol.set_size(s.size); symbol.set_offset(offset_); offset_ += s.size; - alignment_ = std::max(alignment_, s.alignment); + alignment_ = std::max(alignment_, alignVal); return padding; } diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index 2eac890970d52..561d92ecd3e2e 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ 
-38,8 +38,8 @@ end ! CHECK-LABEL: func.func @_QPsub2() ! CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ex"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) -! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> -! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref}>>, !fir.field) -> !fir.ref> +! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{{[<]?}}{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}{{[>]?}}> +! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref}{{[>]?}}>>, !fir.field) -> !fir.ref> ! CHECK: %[[ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> ! CHECK: %[[ADDRESS_COORD:.*]] = fir.coordinate_of %[[CPTR_COORD]], %[[ADDRESS]] : (!fir.ref>, !fir.field) -> !fir.ref ! CHECK: %[[ADDRESS_LOADED:.*]] = fir.load %[[ADDRESS_COORD]] : !fir.ref diff --git a/flang/test/Lower/HLFIR/bindc-value-derived.f90 b/flang/test/Lower/HLFIR/bindc-value-derived.f90 index 7a2196dfc8bf1..5af9f8edc804c 100644 --- a/flang/test/Lower/HLFIR/bindc-value-derived.f90 +++ b/flang/test/Lower/HLFIR/bindc-value-derived.f90 @@ -14,11 +14,11 @@ subroutine test(x) bind(c) call use_it(x%i) end subroutine ! CHECK-LABEL: func.func @test( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{i:i32}> -! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{i:i32}> -! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) -! CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"} : (!fir.ref>) -> !fir.ref +! CHECK-SAME: %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> +! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref]?}}>> +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref]?}}>>, !fir.dscope) -> (!fir.ref]?}}>>, !fir.ref]?}}>>) +! CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"} : (!fir.ref]?}}>>) -> !fir.ref ! CHECK: fir.call @_QPuse_it(%[[VAL_3]]) fastmath : (!fir.ref) -> () ! CHECK: return ! CHECK: } @@ -28,10 +28,10 @@ subroutine call_it(x) call test(x) end subroutine ! CHECK-LABEL: func.func @_QMbindc_byvalPcall_it( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref> -! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs fastmath : (!fir.type<_QMbindc_byvalTt{i:i32}>) -> () +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref]?}}>> +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref]?}}>>, !fir.dscope) -> (!fir.ref]?}}>>, !fir.ref]?}}>>) +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref]?}}>> +! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs fastmath : (!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>) -> () ! CHECK: return ! 
CHECK: } end module diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index d0ed0cbb4c831..8daf20e1ae400 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -1,10 +1,10 @@ ! This test checks lowering of OpenMP allocate Directive with align clause. -// RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s 2>&1 | FileCheck %s program main integer :: x - // CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OpenMPDeclarativeAllocate !$omp allocate(x) align(32) end diff --git a/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 b/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 deleted file mode 100644 index db4839593c7e7..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 +++ /dev/null @@ -1,12 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause REDUCTION in TEAMS construct -subroutine reduction_teams() - integer :: i - i = 0 - - !$omp teams reduction(+:i) - i = i + 1 - !$omp end teams -end subroutine reduction_teams diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90 index f3d147c10668f..9e9ccf8e3d914 100644 --- a/flang/test/Lower/OpenMP/copyin.f90 +++ b/flang/test/Lower/OpenMP/copyin.f90 @@ -86,7 +86,7 @@ subroutine copyin_char_chararray() end ! CHECK-LABEL: func.func @_QPcopyin_derived_type() { -! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_derived_typeE.b.my_type.t_arr) : !fir.ref,value:i64}>>> +! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_derived_typeE.b.my_type.t_arr) : !fir.ref,value:i64}{{[>]?}}>>> ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/OpenMP/reduction-teams.f90 b/flang/test/Lower/OpenMP/reduction-teams.f90 new file mode 100644 index 0000000000000..6997e774c2d42 --- /dev/null +++ b/flang/test/Lower/OpenMP/reduction-teams.f90 @@ -0,0 +1,18 @@ +! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: omp.declare_reduction @[[RED:.*]] : i32 init { + +! CHECK: func.func @_QPreduction_teams() { +subroutine reduction_teams() + integer :: i + i = 0 + + ! CHECK: omp.teams reduction(@[[RED]] %{{.*}}#0 -> %[[PRIV_I:.*]] : !fir.ref) { + !$omp teams reduction(+:i) + ! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[PRIV_I]] + ! CHECK: %{{.*}} = fir.load %[[DECL_I]]#0 : !fir.ref + ! CHECK: hlfir.assign %{{.*}} to %[[DECL_I]]#0 : i32, !fir.ref + i = i + 1 + !$omp end teams +end subroutine reduction_teams diff --git a/flang/test/Lower/derived-types-bindc.f90 b/flang/test/Lower/derived-types-bindc.f90 new file mode 100644 index 0000000000000..309b2b7f5f492 --- /dev/null +++ b/flang/test/Lower/derived-types-bindc.f90 @@ -0,0 +1,44 @@ +! Test padding for BIND(C) derived types lowering for AIX target +! RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck %s + +! REQUIRES: target={{.+}}-aix{{.*}} + +subroutine s1() + use, intrinsic :: iso_c_binding + type, bind(c) :: t0 + character(c_char) :: x1 + real(c_double) :: x2 + end type + type(t0) :: xt0 +! 
CHECK-DAG: %_QFs1Tt0 = type <{ [1 x i8], [3 x i8], double }> + + type, bind(c) :: t1 + integer(c_short) :: x1 + real(c_double) :: x2 + end type + type(t1) :: xt1 +! CHECK-DAG: %_QFs1Tt1 = type <{ i16, [2 x i8], double }> + + type, bind(c) :: t2 + integer(c_short) :: x1 + real(c_double) :: x2 + character(c_char) :: x3 + end type + type(t2) :: xt2 +! CHECK-DAG: %_QFs1Tt2 = type <{ i16, [2 x i8], double, [1 x i8], [3 x i8] }> + + type, bind(c) :: t3 + character(c_char) :: x1 + complex(c_double_complex) :: x2 + end type + type(t3) :: xt3 +! CHECK-DAG: %_QFs1Tt3 = type <{ [1 x i8], [3 x i8], { double, double } }> + + type, bind(c) :: t4 + integer(c_short) :: x1 + complex(c_double_complex) :: x2 + character(c_char) :: x3 + end type + type(t4) :: xt4 +! CHECK-DAG: %_QFs1Tt4 = type <{ i16, [2 x i8], { double, double }, [1 x i8], [3 x i8] }> +end subroutine s1 diff --git a/flang/test/Lower/intentout-deallocate.f90 b/flang/test/Lower/intentout-deallocate.f90 index 8e7ccbcc9fdb9..931cf7d48885f 100644 --- a/flang/test/Lower/intentout-deallocate.f90 +++ b/flang/test/Lower/intentout-deallocate.f90 @@ -123,24 +123,24 @@ subroutine sub5(t) ! on the caller side. ! CHECK-LABEL: func.func @_QMmod1Psub4() -! FIR: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "t", uniq_name = "_QMmod1Fsub4Et"} +! FIR: %[[BOX:.*]] = fir.alloca !fir.box]?}}>>> {bindc_name = "t", uniq_name = "_QMmod1Fsub4Et"} ! HLFIR: %[[BOX:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub4Et" ! CHECK-NOT: fir.call @_FortranAAllocatableDeallocate -! CHECK: fir.call @_QMmod1Psub5(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref>>>) -> () +! CHECK: fir.call @_QMmod1Psub5(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref]?}}>>>>) -> () ! Check deallocation of allocatble intent(out) on the callee side. Deallocation ! is done with a runtime call. ! CHECK-LABEL: func.func @_QMmod1Psub5( -! FIR-SAME: %[[ARG0:.*]]: !fir.ref>>> {fir.bindc_name = "t"}) +! FIR-SAME: %[[ARG0:.*]]: !fir.ref]?}}>>>> {fir.bindc_name = "t"}) ! HLFIR: %[[ARG0:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub5Et" -! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]]{{[#1]*}} : !fir.ref>>> -! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.heap> -! CHECK: %[[BOX_ADDR_PTR:.*]] = fir.convert %[[BOX_ADDR]] : (!fir.heap>) -> i64 +! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]]{{[#1]*}} : !fir.ref]?}}>>>> +! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box]?}}>>>) -> !fir.heap]?}}>> +! CHECK: %[[BOX_ADDR_PTR:.*]] = fir.convert %[[BOX_ADDR]] : (!fir.heap]?}}>>) -> i64 ! CHECK: %[[C0:.*]] = arith.constant 0 : i64 ! CHECK: %[[IS_ALLOCATED:.*]] = arith.cmpi ne, %[[BOX_ADDR_PTR]], %[[C0]] : i64 ! CHECK: fir.if %[[IS_ALLOCATED]] { -! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]]{{[#1]*}} : (!fir.ref>>>) -> !fir.ref> +! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]]{{[#1]*}} : (!fir.ref]?}}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 subroutine sub6() @@ -152,11 +152,11 @@ subroutine sub6() ! Deallocation is done with a runtime call. ! CHECK-LABEL: func.func @_QMmod1Psub6() -! FIR: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "t", uniq_name = "_QMmod1Fsub6Et"} +! FIR: %[[BOX:.*]] = fir.alloca !fir.box]?}}>>> {bindc_name = "t", uniq_name = "_QMmod1Fsub6Et"} ! HLFIR: %[[BOX:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub6Et" -! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]]{{[#1]*}} : (!fir.ref>>>) -> !fir.ref> +! 
CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]]{{[#1]*}} : (!fir.ref]?}}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 -! CHECK: fir.call @sub7(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref>>>) -> () +! CHECK: fir.call @sub7(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref]?}}>>>>) -> () subroutine sub8() integer, allocatable :: a(:) diff --git a/flang/test/Semantics/offsets04.f90 b/flang/test/Semantics/offsets04.f90 new file mode 100644 index 0000000000000..d0d871a981c17 --- /dev/null +++ b/flang/test/Semantics/offsets04.f90 @@ -0,0 +1,105 @@ +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s + +!REQUIRES: target={{.+}}-aix{{.*}} + +! Size and alignment of bind(c) derived types +subroutine s1() + use, intrinsic :: iso_c_binding + type, bind(c) :: dt1 + character(c_char) :: x1 !CHECK: x1 size=1 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt2 + character(c_char) :: x1(9) !CHECK: x1 size=9 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=12: + end type + type, bind(c) :: dt3 + integer(c_short) :: x1 !CHECK: x1 size=2 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt4 + integer(c_int) :: x1 !CHECK: x1 size=4 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt5 + real(c_double) :: x1 !CHECK: x1 size=8 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=8: + end type + type, bind(c) :: dt6 + integer(c_long) :: x1 !CHECK: x1 size=8 offset=0: + character(c_char) :: x2 !CHECK: x2 size=1 offset=8: + real(c_double) :: x3 !CHECK: x3 size=8 offset=12: + end type + type, bind(c) :: dt7 + integer(c_long) :: x1 !CHECK: x1 size=8 offset=0: + integer(c_long) :: x2 !CHECK: x2 size=8 offset=8: + character(c_char) :: x3 !CHECK: x3 size=1 offset=16: + real(c_double) :: x4 !CHECK: x4 size=8 offset=20: + end type + type, bind(c) :: dt8 + character(c_char) :: x1 !CHECK: x1 size=1 offset=0: + complex(c_double_complex) :: x2 !CHECK: x2 size=16 offset=4: + end type +end subroutine + +subroutine s2() + use, intrinsic :: iso_c_binding + type, bind(c) :: dt10 + character(c_char) :: x1 + real(c_double) :: x2 + end type + type, bind(c) :: dt11 + type(dt10) :: y1 !CHECK: y1 size=12 offset=0: + real(c_double) :: y2 !CHECK: y2 size=8 offset=12: + end type + type, bind(c) :: dt12 + character(c_char) :: y1 !CHECK: y1 size=1 offset=0: + type(dt10) :: y2 !CHECK: y2 size=12 offset=4: + character(c_char) :: y3 !CHECK: y3 size=1 offset=16: + end type + type, bind(c) :: dt13 + integer(c_short) :: y1 !CHECK: y1 size=2 offset=0: + type(dt10) :: y2 !CHECK: y2 size=12 offset=4: + character(c_char) :: y3 !CHECK: y3 size=1 offset=16: + end type + + type, bind(c) :: dt20 + character(c_char) :: x1 + integer(c_short) :: x2 + end type + type, bind(c) :: dt21 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt20) :: y2 !CHECK: y2 size=4 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=12: + end type + + type, bind(c) :: dt30 + character(c_char) :: x1 + character(c_char) :: x2 + end type + type, bind(c) :: dt31 + integer(c_long) :: y1 !CHECK: y1 size=8 offset=0: + type(dt30) :: y2 !CHECK: y2 size=2 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=12: + end type + + type, bind(c) :: dt40 + integer(c_short) :: x1 + real(c_double) :: x2 + end type + type, bind(c) :: dt41 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt40) :: y2 !CHECK: y2 
size=12 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=20: + end type + + type, bind(c) :: dt50 + integer(c_short) :: x1 + complex(c_double_complex) :: x2 + end type + type, bind(c) :: dt51 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt50) :: y2 !CHECK: y2 size=20 offset=8: + complex(c_double_complex) :: y3 !CHECK: y3 size=16 offset=28: + end type +end subroutine diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index e7b049c0a6638..723853b2230ae 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -721,6 +721,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.sinhf16 libc.src.math.sinpif16 libc.src.math.sqrtf16 + libc.src.math.tanf16 libc.src.math.tanhf16 libc.src.math.tanpif16 libc.src.math.totalorderf16 diff --git a/libc/docs/headers/math/index.rst b/libc/docs/headers/math/index.rst index 2808165ad539b..8548e4a5773bc 100644 --- a/libc/docs/headers/math/index.rst +++ b/libc/docs/headers/math/index.rst @@ -346,7 +346,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sqrt | |check| | |check| | |check| | |check| | |check| | 7.12.7.10 | F.10.4.10 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| tan | |check| | |check| | | | | 7.12.4.7 | F.10.1.7 | +| tan | |check| | |check| | | |check| | | 7.12.4.7 | F.10.1.7 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | tanh | |check| | | | |check| | | 7.12.5.6 | F.10.2.6 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/include/llvm-libc-types/cfloat128.h b/libc/include/llvm-libc-types/cfloat128.h index f76a0c1c2f5af..83fad87910137 100644 --- a/libc/include/llvm-libc-types/cfloat128.h +++ b/libc/include/llvm-libc-types/cfloat128.h @@ -18,22 +18,24 @@ // // TODO: Update the complex variant of C23 `_Float128` type detection again when // clang supports it. -#if defined(__STDC_IEC_60559_COMPLEX__) && !defined(__clang__) -#if !defined(__cplusplus) -#define LIBC_TYPES_HAS_CFLOAT128 -typedef _Complex _Float128 cfloat128; -#elif defined(__GNUC__) && __GNUC__ >= 13 -#define LIBC_TYPES_HAS_CFLOAT128 -typedef _Complex _Float128 cfloat128; -#endif -#elif __clang_major__ >= 11 && \ +#ifdef __clang__ +#if (__clang_major__ >= 11) && \ (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) // Use _Complex __float128 type. 
clang uses __SIZEOF_FLOAT128__ or __FLOAT128__
 // macro to notify the availability of __float128 type:
 // https://reviews.llvm.org/D15120
 #define LIBC_TYPES_HAS_CFLOAT128
 typedef _Complex __float128 cfloat128;
-#elif (LDBL_MANT_DIG == 113)
+#endif
+#elif defined(__GNUC__)
+#if (defined(__STDC_IEC_60559_COMPLEX__) || defined(__SIZEOF_FLOAT128__)) && \
+    (__GNUC__ >= 13 || (!defined(__cplusplus)))
+#define LIBC_TYPES_HAS_CFLOAT128
+typedef _Complex _Float128 cfloat128;
+#endif
+#endif
+
+#if !defined(LIBC_TYPES_HAS_CFLOAT128) && (LDBL_MANT_DIG == 113)
 #define LIBC_TYPES_HAS_CFLOAT128
 #define LIBC_TYPES_CFLOAT128_IS_COMPLEX_LONG_DOUBLE
 typedef _Complex long double cfloat128;
diff --git a/libc/include/math.yaml b/libc/include/math.yaml
index 831d045745677..3a660a59d3605 100644
--- a/libc/include/math.yaml
+++ b/libc/include/math.yaml
@@ -2418,6 +2418,13 @@ functions:
     return_type: float
     arguments:
       - type: float
+  - name: tanf16
+    standards:
+      - stdc
+    return_type: _Float16
+    arguments:
+      - type: _Float16
+    guard: LIBC_TYPES_HAS_FLOAT16
   - name: tanhf
     standards:
       - stdc
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index e4e2c49642f2d..fe5ebd793b40a 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -501,6 +501,7 @@ add_math_entrypoint_object(sqrtf128)
 
 add_math_entrypoint_object(tan)
 add_math_entrypoint_object(tanf)
+add_math_entrypoint_object(tanf16)
 add_math_entrypoint_object(tanh)
 add_math_entrypoint_object(tanhf)
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 382f5b362e2eb..0e57051807b33 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -661,6 +661,25 @@ add_entrypoint_object(
     ${libc_opt_high_flag}
 )
 
+add_entrypoint_object(
+  tanf16
+  SRCS
+    tanf16.cpp
+  HDRS
+    ../tanf16.h
+  DEPENDS
+    .sincosf16_utils
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.types
+)
+
 add_entrypoint_object(
   tanpif16
   SRCS
diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h
index 87b1dde560c5e..133896b5de7a3 100644
--- a/libc/src/math/generic/sincosf16_utils.h
+++ b/libc/src/math/generic/sincosf16_utils.h
@@ -47,24 +47,23 @@ LIBC_INLINE int32_t range_reduction_sincospif16(float x, float &y) {
 // Recall, range reduction:
 // k = round(x * 32/pi)
-// y = x * 32/pi - k
 //
-// The constant 0x1.45f306dc9c883p3 is 32/pi rounded to double-precision.
-// 32/pi is generated by Sollya with the following commands:
-// > display = hexadecimal;
-// > round(32/pi, D, RN);
-//
-// The precision choice of 'double' is to minimize rounding errors
-// in this initial scaling step, preserving enough bits so errors accumulated
-// while computing the subtraction: y = x * 32/pi - round(x * 32/pi)
+// The precision choice of 'double' in the following function is to minimize
+// rounding errors in this initial scaling step,
+// preserving enough bits so errors accumulated while computing the subtraction:
+// y = x * 32/pi - round(x * 32/pi)
 // are beyond the least-significant bit of single-precision used during
 // further intermediate computation.
 LIBC_INLINE int32_t range_reduction_sincosf16(float x, float &y) {
-  double prod = x * 0x1.45f306dc9c883p3;
-  double kf = fputil::nearest_integer(prod);
-  y = static_cast<float>(prod - kf);
+  // Generated by Sollya with:
+  // > D(32/pi);
+  constexpr double THIRTYTWO_OVER_PI = 0x1.45f306dc9c883p3;
 
-  return static_cast<int32_t>(kf);
+  double prod = x * THIRTYTWO_OVER_PI;
+  double kd = fputil::nearest_integer(prod);
+  y = static_cast<float>(prod - kd);
+
+  return static_cast<int32_t>(kd);
 }
 
 static LIBC_INLINE void sincosf16_poly_eval(int32_t k, float y, float &sin_k,
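
A minimal standalone sketch of the reduction above, with std::nearbyint standing in for fputil::nearest_integer (an assumption; both round to the nearest integer under the default rounding mode): for x = 1.0f the scaled product is about 10.186, so k = 10 and |y| stays within 0.5.

#include <cmath>
#include <cstdio>

int main() {
  constexpr double THIRTYTWO_OVER_PI = 0x1.45f306dc9c883p3; // 32/pi from Sollya
  float x = 1.0f;
  double prod = x * THIRTYTWO_OVER_PI;     // ~10.18592
  double kd = std::nearbyint(prod);        // k = 10
  float y = static_cast<float>(prod - kd); // y ~ 0.18592, |y| <= 0.5
  std::printf("k = %d, y = %f\n", static_cast<int>(kd), y);
  return 0;
}
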
diff --git a/libc/src/math/generic/tanf16.cpp b/libc/src/math/generic/tanf16.cpp
new file mode 100644
index 0000000000000..48aa51e456a8a
--- /dev/null
+++ b/libc/src/math/generic/tanf16.cpp
@@ -0,0 +1,115 @@
+//===-- Half-precision tan(x) function ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception.
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tanf16.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "sincosf16_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+constexpr size_t N_EXCEPTS = 9;
+
+constexpr fputil::ExceptValues<float16, N_EXCEPTS> TANF16_EXCEPTS{{
+    // (input, RZ output, RU offset, RD offset, RN offset)
+    {0x2894, 0x2894, 1, 0, 1},
+    {0x3091, 0x3099, 1, 0, 0},
+    {0x3098, 0x30a0, 1, 0, 0},
+    {0x55ed, 0x3911, 1, 0, 0},
+    {0x607b, 0xc638, 0, 1, 1},
+    {0x674e, 0x3b7d, 1, 0, 0},
+    {0x6807, 0x4014, 1, 0, 1},
+    {0x6f4d, 0xbe19, 0, 1, 1},
+    {0x7330, 0xcb62, 0, 1, 0},
+}};
+
+LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) {
+  using FPBits = fputil::FPBits<float16>;
+  FPBits xbits(x);
+
+  uint16_t x_u = xbits.uintval();
+  uint16_t x_abs = x_u & 0x7fff;
+  bool x_sign = x_u >> 15;
+  float xf = x;
+
+  // Handle exceptional values
+  if (auto r = TANF16_EXCEPTS.lookup_odd(x_abs, x_sign);
+      LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+
+  // |x| <= 0x1.d1p-5
+  if (LIBC_UNLIKELY(x_abs <= 0x2b44)) {
+    // |x| <= 0x1.398p-11
+    if (LIBC_UNLIKELY(x_abs <= 0x10e6)) {
+      // tan(+/-0) = +/-0
+      if (LIBC_UNLIKELY(x_abs == 0))
+        return x;
+
+      int rounding = fputil::quick_get_round();
+
+      // Exhaustive tests show that, when:
+      // x > 0, and rounding upward or
+      // x < 0, and rounding downward then,
+      // tan(x) = x * 2^-11 + x
+      if ((xbits.is_pos() && rounding == FE_UPWARD) ||
+          (xbits.is_neg() && rounding == FE_DOWNWARD))
+        return fputil::cast<float16>(fputil::multiply_add(xf, 0x1.0p-11f, xf));
+      return x;
+    }
+
+    float xsq = xf * xf;
+
+    // Degree-6 minimax odd polynomial of tan(x) generated by Sollya with:
+    // > P = fpminimax(tan(x)/x, [|0, 2, 4, 6|], [|1, SG...|], [0, pi/32]);
+    float result = fputil::polyeval(xsq, 0x1p0f, 0x1.555556p-2f, 0x1.110ee4p-3f,
+                                    0x1.be80f6p-5f);
+
+    return fputil::cast<float16>(xf * result);
+  }
+
+  // tan(+/-inf) = NaN, and tan(NaN) = NaN
+  if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
+    // x = +/-inf
+    if (x_abs == 0x7c00) {
+      fputil::set_errno_if_required(EDOM);
+      fputil::raise_except_if_required(FE_INVALID);
+    }
+
+    return x + FPBits::quiet_nan().get_val();
+  }
+
+  // Range reduction:
+  // For |x| > pi/32, we perform range reduction as follows:
+  // Find k and y such that:
+  //   x = (k + y) * pi/32;
+  //   k is an integer, |y| < 0.5
+  //
+  // This is done by performing:
+  //   k = round(x * 32/pi)
+  //   y = x * 32/pi - k
+  //
+  // Once k and y are computed, we then deduce the answer by the formula:
+  // tan(x) = sin(x) / cos(x)
+  //        = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
+  float sin_k, cos_k, sin_y, cosm1_y;
+  sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y);
+
+  // Note that, cosm1_y = cos_y - 1:
+  using fputil::multiply_add;
+  return fputil::cast<float16>(
+      multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
+      multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
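
The return expression above packs the angle-addition identities into multiply_add form by writing cos_y as 1 + cosm1_y. A double-precision sketch of the same algebra, using plain <cmath> calls in place of sincosf16_eval (the sample k and y values are illustrative only):

#include <cmath>
#include <cstdio>

int main() {
  const double pi_over_32 = 0x1.921fb54442d18p1 / 32.0;
  int k = 10;
  double y = 0.18592;
  double x = (k + y) * pi_over_32;
  double sin_k = std::sin(k * pi_over_32), cos_k = std::cos(k * pi_over_32);
  double sin_y = std::sin(y * pi_over_32);
  double cosm1_y = std::cos(y * pi_over_32) - 1.0; // cos_y - 1
  // tan(x) = (sin_y*cos_k + (1 + cosm1_y)*sin_k) /
  //          ((1 + cosm1_y)*cos_k - sin_y*sin_k)
  double num = sin_y * cos_k + (cosm1_y * sin_k + sin_k);
  double den = sin_y * -sin_k + (cosm1_y * cos_k + cos_k);
  std::printf("reconstructed %.12f, direct tan %.12f\n", num / den, std::tan(x));
  return 0;
}
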
diff --git a/libc/src/math/generic/tanpif16.cpp b/libc/src/math/generic/tanpif16.cpp
index 67635536ee319..cf4f9917d4537 100644
--- a/libc/src/math/generic/tanpif16.cpp
+++ b/libc/src/math/generic/tanpif16.cpp
@@ -79,7 +79,7 @@ LLVM_LIBC_FUNCTION(float16, tanpif16, (float16 x)) {
   // k = round(x * 32)
   // y = x * 32 - k
   //
-  // Once k and y are computed, we then deduce the answer by tthe formula:
+  // Once k and y are computed, we then deduce the answer by the formula:
   // tan(x) = sin(x) / cos(x)
   //        = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
   float xf = x;
diff --git a/libc/src/math/tanf16.h b/libc/src/math/tanf16.h
new file mode 100644
index 0000000000000..bf1b61e9837f7
--- /dev/null
+++ b/libc/src/math/tanf16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for tanf16 ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TANF16_H
+#define LLVM_LIBC_SRC_MATH_TANF16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 tanf16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TANF16_H
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 16e7d4957ba11..ae8518ee4b4cc 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -190,6 +190,17 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  tanf16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    tanf16_test.cpp
+  DEPENDS
+    libc.src.math.tanf16
+)
+
 add_fp_unittest(
   tanpif16_test
   NEED_MPFR
diff --git a/libc/test/src/math/cosf16_test.cpp b/libc/test/src/math/cosf16_test.cpp
index 9e4687f0325c4..b744e7817e4ba 100644
--- a/libc/test/src/math/cosf16_test.cpp
+++ b/libc/test/src/math/cosf16_test.cpp
@@ -17,7 +17,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 // Range: [0, Inf]
 static constexpr uint16_t POS_START = 0x0000U;
-static constexpr uint16_t POS_STOP = 0x7c00u;
+static constexpr uint16_t POS_STOP = 0x7c00U;
 
 // Range: [-Inf, 0]
 static constexpr uint16_t NEG_START = 0x8000U;
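
The 0x7c00 bound used here and in the tanf16 tests below is the IEEE-754 binary16 encoding of +infinity (sign 0, all-ones 5-bit exponent, zero 10-bit mantissa), so [POS_START, POS_STOP] sweeps every non-negative half-precision value. A quick decode sketch:

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t v = 0x7c00;
  unsigned sign = v >> 15;
  unsigned exp = (v >> 10) & 0x1f; // 5 exponent bits
  unsigned mant = v & 0x3ff;       // 10 mantissa bits
  std::printf("sign=%u exp=%u mant=%u -> %s\n", sign, exp, mant,
              (exp == 0x1f && mant == 0) ? "infinity" : "finite or NaN");
  return 0;
}
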
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 31f85a3ecfd27..e23e7f41222d4 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -121,6 +121,17 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  tanf16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    tanf16_test.cpp
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.math.tanf16
+)
+
 add_fp_unittest(
   tanpif16_test
   SUITE
diff --git a/libc/test/src/math/smoke/tanf16_test.cpp b/libc/test/src/math/smoke/tanf16_test.cpp
new file mode 100644
index 0000000000000..39d1182ba891e
--- /dev/null
+++ b/libc/test/src/math/smoke/tanf16_test.cpp
@@ -0,0 +1,34 @@
+//===-- Unittests for tanf16 ----------------------------------------------===//
+//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception.
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/errno/libc_errno.h"
+#include "src/math/tanf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcTanf16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::tanf16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::tanf16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/tanf16_test.cpp b/libc/test/src/math/tanf16_test.cpp
new file mode 100644
index 0000000000000..f2e874182efc1
--- /dev/null
+++ b/libc/test/src/math/tanf16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for tanf16 ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tanf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf]
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0]
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcTanf16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x,
+                                   LIBC_NAMESPACE::tanf16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcTanf16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x,
+                                   LIBC_NAMESPACE::tanf16(x), 0.5);
+  }
+}
diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index 228c3f3432c29..15940948655d7 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -146,6 +146,8 @@ Deprecations and Removals
   ``__undeclare_reachable`` have been removed from the library. These functions were never implemented in a
   non-trivial way, making it very unlikely that any binary depends on them.
 
+- Non-conforming extension ``packaged_task::result_type`` is deprecated. It will be removed in LLVM 21.
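
A minimal sketch of the usage this deprecation flags (Task is a hypothetical alias); portable code should derive the type from the call signature rather than from the member alias:

#include <future>
#include <type_traits>

int main() {
  using Task = std::packaged_task<int(double)>;
  // Deprecated, non-conforming extension; expected to warn under
  // -Wdeprecated-declarations and to stop compiling in LLVM 21.
  static_assert(std::is_same_v<Task::result_type, int>);
  return 0;
}
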
+ Upcoming Deprecations and Removals ---------------------------------- @@ -164,6 +166,8 @@ LLVM 21 - The ``_LIBCPP_VERBOSE_ABORT_NOT_NOEXCEPT`` macro will be removed in LLVM 21, making ``std::__libcpp_verbose_abort`` unconditionally ``noexcept``. +- Non-conforming extension ``packaged_task::result_type`` will be removed in LLVM 21. + ABI Affecting Changes --------------------- diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index f7721b1047b81..f3313bf53460a 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -499,7 +499,6 @@ set(files __locale_dir/locale_base_api.h __locale_dir/locale_base_api/android.h __locale_dir/locale_base_api/bsd_locale_fallbacks.h - __locale_dir/locale_base_api/fuchsia.h __locale_dir/locale_base_api/ibm.h __locale_dir/locale_base_api/musl.h __locale_dir/locale_base_api/openbsd.h @@ -507,6 +506,9 @@ set(files __locale_dir/support/apple.h __locale_dir/support/bsd_like.h __locale_dir/support/freebsd.h + __locale_dir/support/fuchsia.h + __locale_dir/support/no_locale/characters.h + __locale_dir/support/no_locale/strtonum.h __locale_dir/support/windows.h __math/abs.h __math/copysign.h @@ -547,7 +549,6 @@ set(files __memory/array_cookie.h __memory/assume_aligned.h __memory/auto_ptr.h - __memory/builtin_new_allocator.h __memory/compressed_pair.h __memory/concepts.h __memory/construct_at.h @@ -880,6 +881,7 @@ set(files __utility/cmp.h __utility/convert_to_integral.h __utility/declval.h + __utility/element_count.h __utility/empty.h __utility/exception_guard.h __utility/exchange.h diff --git a/libcxx/include/__cxx03/__config b/libcxx/include/__cxx03/__config index 3e8f181664c97..880d14a50a052 100644 --- a/libcxx/include/__cxx03/__config +++ b/libcxx/include/__cxx03/__config @@ -230,7 +230,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # endif # if defined(__MVS__) -# include <__cxx03/features.h> // for __NATIVE_ASCII_F +# include // for __NATIVE_ASCII_F # endif # if defined(_WIN32) diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h index 9fe84250b1204..ab53b7a285ca4 100644 --- a/libcxx/include/__flat_map/flat_map.h +++ b/libcxx/include/__flat_map/flat_map.h @@ -17,7 +17,7 @@ #include <__algorithm/ranges_inplace_merge.h> #include <__algorithm/ranges_lower_bound.h> #include <__algorithm/ranges_partition_point.h> -#include <__algorithm/ranges_stable_sort.h> +#include <__algorithm/ranges_sort.h> #include <__algorithm/ranges_unique.h> #include <__algorithm/ranges_upper_bound.h> #include <__algorithm/remove_if.h> @@ -853,9 +853,7 @@ class flat_map { // is no invariant state to preserve _LIBCPP_HIDE_FROM_ABI void __sort_and_unique() { auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); - // To be consistent with std::map's behaviour, we use stable_sort instead of sort. - // As a result, if there are duplicated keys, the first value in the original order will be taken. 
- ranges::stable_sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); + ranges::sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); auto __dup_start = ranges::unique(__zv, __key_equiv(__compare_)).begin(); auto __dist = ranges::distance(__zv.begin(), __dup_start); __containers_.keys.erase(__containers_.keys.begin() + __dist, __containers_.keys.end()); @@ -886,7 +884,7 @@ class flat_map { return __compare_(std::get<0>(__p1), std::get<0>(__p2)); }; if constexpr (!_WasSorted) { - ranges::stable_sort(__zv.begin() + __append_start_offset, __end, __compare_key); + ranges::sort(__zv.begin() + __append_start_offset, __end, __compare_key); } else { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( __is_sorted_and_unique(__containers_.keys | ranges::views::drop(__append_start_offset)), diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index 2924f6cad6578..08cb731be9725 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -22,7 +22,6 @@ #include <__memory/allocator.h> #include <__memory/allocator_destructor.h> #include <__memory/allocator_traits.h> -#include <__memory/builtin_new_allocator.h> #include <__memory/compressed_pair.h> #include <__memory/unique_ptr.h> #include <__type_traits/aligned_storage.h> @@ -193,6 +192,13 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> { } }; +template +struct __deallocating_deleter { + _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const { + std::__libcpp_deallocate<_Tp>(static_cast<_Tp*>(__p), __element_count(1)); + } +}; + template class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { _Fp __f_; @@ -212,8 +218,9 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { } _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const { - __builtin_new_allocator::__holder_t __hold = __builtin_new_allocator::__allocate_type<__default_alloc_func>(1); - __default_alloc_func* __res = ::new ((void*)__hold.get()) __default_alloc_func(__f_); + using _Self = __default_alloc_func; + unique_ptr<_Self, __deallocating_deleter<_Self>> __hold(std::__libcpp_allocate<_Self>(__element_count(1))); + _Self* __res = ::new ((void*)__hold.get()) _Self(__f_); (void)__hold.release(); return __res; } @@ -222,7 +229,7 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { _LIBCPP_HIDE_FROM_ABI static void __destroy_and_delete(__default_alloc_func* __f) { __f->destroy(); - __builtin_new_allocator::__deallocate_type<__default_alloc_func>(__f, 1); + std::__libcpp_deallocate<__default_alloc_func>(__f, __element_count(1)); } }; @@ -668,8 +675,8 @@ class __policy_func<_Rp(_ArgTypes...)> { if (__use_small_storage<_Fun>()) { ::new ((void*)&__buf_.__small) _Fun(std::move(__f)); } else { - __builtin_new_allocator::__holder_t __hold = __builtin_new_allocator::__allocate_type<_Fun>(1); - __buf_.__large = ::new ((void*)__hold.get()) _Fun(std::move(__f)); + unique_ptr<_Fun, __deallocating_deleter<_Fun>> __hold(std::__libcpp_allocate<_Fun>(__element_count(1))); + __buf_.__large = ::new ((void*)__hold.get()) _Fun(std::move(__f)); (void)__hold.release(); } } diff --git a/libcxx/include/__locale b/libcxx/include/__locale index 01c3a2e3456ba..e10eb62fb844b 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -348,7 +348,7 @@ public: # define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA #elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) # ifdef __APPLE__ - typedef __uint32_t mask; + typedef uint32_t mask; # elif 
defined(__FreeBSD__) typedef unsigned long mask; # elif defined(__NetBSD__) diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index bb0da889f4c84..b112a4aef7765 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -99,6 +99,8 @@ # include <__locale_dir/support/freebsd.h> #elif defined(_LIBCPP_MSVCRT_LIKE) # include <__locale_dir/support/windows.h> +#elif defined(__Fuchsia__) +# include <__locale_dir/support/fuchsia.h> #else // TODO: This is a temporary definition to bridge between the old way we defined the locale base API @@ -111,8 +113,6 @@ # include <__locale_dir/locale_base_api/android.h> # elif defined(__OpenBSD__) # include <__locale_dir/locale_base_api/openbsd.h> -# elif defined(__Fuchsia__) -# include <__locale_dir/locale_base_api/fuchsia.h> # elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC # include <__locale_dir/locale_base_api/musl.h> # endif diff --git a/libcxx/include/__locale_dir/locale_base_api/fuchsia.h b/libcxx/include/__locale_dir/locale_base_api/fuchsia.h deleted file mode 100644 index f6ef454ba7ada..0000000000000 --- a/libcxx/include/__locale_dir/locale_base_api/fuchsia.h +++ /dev/null @@ -1,18 +0,0 @@ -// -*- C++ -*- -//===-----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H -#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H - -#include <__support/xlocale/__posix_l_fallback.h> -#include <__support/xlocale/__strtonum_fallback.h> -#include -#include - -#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H diff --git a/libcxx/include/__locale_dir/support/fuchsia.h b/libcxx/include/__locale_dir/support/fuchsia.h new file mode 100644 index 0000000000000..4a54896c8e268 --- /dev/null +++ b/libcxx/include/__locale_dir/support/fuchsia.h @@ -0,0 +1,143 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H + +#include <__config> +#include <__utility/forward.h> +#include // uselocale & friends +#include +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +struct __locale_guard { + _LIBCPP_HIDE_FROM_ABI __locale_guard(locale_t& __loc) : __old_loc_(::uselocale(__loc)) {} + + _LIBCPP_HIDE_FROM_ABI ~__locale_guard() { + if (__old_loc_) + ::uselocale(__old_loc_); + } + + locale_t __old_loc_; + + __locale_guard(__locale_guard const&) = delete; + __locale_guard& operator=(__locale_guard const&) = delete; +}; + +// +// Locale management +// +using __locale_t = locale_t; + +inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __name, __locale_t __loc) { + return ::newlocale(__category_mask, __name, __loc); +} + +inline _LIBCPP_HIDE_FROM_ABI void __freelocale(__locale_t __loc) { ::freelocale(__loc); } + +inline _LIBCPP_HIDE_FROM_ABI lconv* __localeconv(__locale_t& __loc) { + __locale_guard __current(__loc); + return std::localeconv(); +} + +// +// Other functions +// +inline _LIBCPP_HIDE_FROM_ABI decltype(MB_CUR_MAX) __mb_len_max(__locale_t __loc) { + __locale_guard __current(__loc); + return MB_CUR_MAX; +} +#if _LIBCPP_HAS_WIDE_CHARACTERS +inline _LIBCPP_HIDE_FROM_ABI wint_t __btowc(int __ch, __locale_t __loc) { + __locale_guard __current(__loc); + return std::btowc(__ch); +} +inline _LIBCPP_HIDE_FROM_ABI int __wctob(wint_t __ch, __locale_t __loc) { + __locale_guard __current(__loc); + return std::wctob(__ch); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__wcsnrtombs(char* __dest, const wchar_t** __src, size_t __nwc, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::wcsnrtombs(__dest, __src, __nwc, __len, __ps); // non-standard +} +inline _LIBCPP_HIDE_FROM_ABI size_t __wcrtomb(char* __s, wchar_t __ch, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::wcrtomb(__s, __ch, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbsnrtowcs(wchar_t* __dest, const char** __src, size_t __nms, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::mbsnrtowcs(__dest, __src, __nms, __len, __ps); // non-standard +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbrtowc(wchar_t* __pwc, const char* __s, size_t __n, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbrtowc(__pwc, __s, __n, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI int __mbtowc(wchar_t* __pwc, const char* __pmb, size_t __max, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbtowc(__pwc, __pmb, __max); +} +inline _LIBCPP_HIDE_FROM_ABI size_t __mbrlen(const char* __s, size_t __n, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbrlen(__s, __n, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbsrtowcs(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::mbsrtowcs(__dest, __src, __len, __ps); +} +#endif + +_LIBCPP_DIAGNOSTIC_PUSH +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat") +_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wformat-nonliteral") // GCC doesn't support [[gnu::format]] on variadic templates 
+#ifdef _LIBCPP_COMPILER_CLANG_BASED +# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) _LIBCPP_ATTRIBUTE_FORMAT(__VA_ARGS__) +#else +# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) /* nothing */ +#endif + +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __snprintf( + char* __s, size_t __n, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return std::snprintf(__s, __n, __format, std::forward<_Args>(__args)...); +} +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __asprintf( + char** __s, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return ::asprintf(__s, __format, std::forward<_Args>(__args)...); // non-standard +} +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __sscanf( + const char* __s, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return std::sscanf(__s, __format, std::forward<_Args>(__args)...); +} + +_LIBCPP_DIAGNOSTIC_POP +#undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#include <__locale_dir/support/no_locale/characters.h> +#include <__locale_dir/support/no_locale/strtonum.h> + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H diff --git a/libcxx/include/__locale_dir/support/no_locale/characters.h b/libcxx/include/__locale_dir/support/no_locale/characters.h new file mode 100644 index 0000000000000..20e45fc350e2e --- /dev/null +++ b/libcxx/include/__locale_dir/support/no_locale/characters.h @@ -0,0 +1,98 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H + +#include <__config> +#include <__cstddef/size_t.h> +#include +#include +#include +#include +#if _LIBCPP_HAS_WIDE_CHARACTERS +# include +#endif + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +// +// Character manipulation functions +// +inline _LIBCPP_HIDE_FROM_ABI int __islower(int __c, __locale_t) { return std::islower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isupper(int __c, __locale_t) { return std::isupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t) { return std::isdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t) { return std::isxdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t) { return std::toupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __tolower(int __c, __locale_t) { return std::tolower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __strcoll(const char* __s1, const char* __s2, __locale_t) { + return std::strcoll(__s1, __s2); +} + +inline _LIBCPP_HIDE_FROM_ABI size_t __strxfrm(char* __dest, const char* __src, size_t __n, __locale_t) { + return std::strxfrm(__dest, __src, __n); +} + +#if _LIBCPP_HAS_WIDE_CHARACTERS +inline _LIBCPP_HIDE_FROM_ABI int __iswctype(wint_t __c, wctype_t __type, __locale_t) { + return std::iswctype(__c, __type); +} + +inline _LIBCPP_HIDE_FROM_ABI int __iswspace(wint_t __c, __locale_t) { return std::iswspace(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswprint(wint_t __c, __locale_t) { return std::iswprint(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswcntrl(wint_t __c, __locale_t) { return std::iswcntrl(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswupper(wint_t __c, __locale_t) { return std::iswupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswlower(wint_t __c, __locale_t) { return std::iswlower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswalpha(wint_t __c, __locale_t) { return std::iswalpha(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswblank(wint_t __c, __locale_t) { return std::iswblank(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswdigit(wint_t __c, __locale_t) { return std::iswdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswpunct(wint_t __c, __locale_t) { return std::iswpunct(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswxdigit(wint_t __c, __locale_t) { return std::iswxdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI wint_t __towupper(wint_t __c, __locale_t) { return std::towupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI wint_t __towlower(wint_t __c, __locale_t) { return std::towlower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __ws1, const wchar_t* __ws2, __locale_t) { + return std::wcscoll(__ws1, __ws2); +} + +inline _LIBCPP_HIDE_FROM_ABI size_t __wcsxfrm(wchar_t* __dest, const wchar_t* __src, size_t __n, __locale_t) { + return std::wcsxfrm(__dest, __src, __n); +} +#endif // _LIBCPP_HAS_WIDE_CHARACTERS + +inline _LIBCPP_HIDE_FROM_ABI size_t +__strftime(char* __s, size_t __max, const char* __format, const struct tm* __tm, __locale_t) { + return std::strftime(__s, __max, __format, __tm); +} + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H diff --git 
a/libcxx/include/__locale_dir/support/no_locale/strtonum.h b/libcxx/include/__locale_dir/support/no_locale/strtonum.h new file mode 100644 index 0000000000000..0e7a32993e736 --- /dev/null +++ b/libcxx/include/__locale_dir/support/no_locale/strtonum.h @@ -0,0 +1,49 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +// +// Strtonum functions +// +inline _LIBCPP_HIDE_FROM_ABI float __strtof(const char* __nptr, char** __endptr, __locale_t) { + return std::strtof(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI double __strtod(const char* __nptr, char** __endptr, __locale_t) { + return std::strtod(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI long double __strtold(const char* __nptr, char** __endptr, __locale_t) { + return std::strtold(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI long long __strtoll(const char* __nptr, char** __endptr, int __base, __locale_t) { + return std::strtoll(__nptr, __endptr, __base); +} + +inline _LIBCPP_HIDE_FROM_ABI unsigned long long +__strtoull(const char* __nptr, char** __endptr, int __base, __locale_t) { + return std::strtoull(__nptr, __endptr, __base); +} + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h index a7066885a978a..191a59e6614a0 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -102,7 +102,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v if (__libcpp_is_constant_evaluated()) { return static_cast<_Tp*>(::operator new(__n * sizeof(_Tp))); } else { - return static_cast<_Tp*>(std::__libcpp_allocate(__n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp))); + return std::__libcpp_allocate<_Tp>(__element_count(__n)); } } @@ -117,7 +117,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v if (__libcpp_is_constant_evaluated()) { ::operator delete(__p); } else { - std::__libcpp_deallocate((void*)__p, __n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); + std::__libcpp_deallocate<_Tp>(__p, __element_count(__n)); } } diff --git a/libcxx/include/__memory/builtin_new_allocator.h b/libcxx/include/__memory/builtin_new_allocator.h deleted file mode 100644 index cde1a6025a9a7..0000000000000 --- a/libcxx/include/__memory/builtin_new_allocator.h +++ /dev/null @@ -1,67 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H -#define _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H - -#include <__config> -#include <__cstddef/size_t.h> -#include <__memory/unique_ptr.h> -#include <__new/allocate.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -// __builtin_new_allocator -- A non-templated helper for allocating and -// deallocating memory using __builtin_operator_new and -// __builtin_operator_delete. It should be used in preference to -// `std::allocator` to avoid additional instantiations. -struct __builtin_new_allocator { - struct __builtin_new_deleter { - typedef void* pointer_type; - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __builtin_new_deleter(size_t __size, size_t __align) - : __size_(__size), __align_(__align) {} - - _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const _NOEXCEPT { - std::__libcpp_deallocate(__p, __size_, __align_); - } - - private: - size_t __size_; - size_t __align_; - }; - - typedef unique_ptr __holder_t; - - _LIBCPP_HIDE_FROM_ABI static __holder_t __allocate_bytes(size_t __s, size_t __align) { - return __holder_t(std::__libcpp_allocate(__s, __align), __builtin_new_deleter(__s, __align)); - } - - _LIBCPP_HIDE_FROM_ABI static void __deallocate_bytes(void* __p, size_t __s, size_t __align) _NOEXCEPT { - std::__libcpp_deallocate(__p, __s, __align); - } - - template - _LIBCPP_NODEBUG _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI static __holder_t __allocate_type(size_t __n) { - return __allocate_bytes(__n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); - } - - template - _LIBCPP_NODEBUG _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI static void - __deallocate_type(void* __p, size_t __n) _NOEXCEPT { - __deallocate_bytes(__p, __n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); - } -}; - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H diff --git a/libcxx/include/__memory/unique_temporary_buffer.h b/libcxx/include/__memory/unique_temporary_buffer.h index dea7fa8e18728..32a3f0f081c00 100644 --- a/libcxx/include/__memory/unique_temporary_buffer.h +++ b/libcxx/include/__memory/unique_temporary_buffer.h @@ -40,7 +40,7 @@ struct __temporary_buffer_deleter { return; } - std::__libcpp_deallocate_unsized((void*)__ptr, _LIBCPP_ALIGNOF(_Tp)); + std::__libcpp_deallocate_unsized<_Tp>(__ptr); } }; diff --git a/libcxx/include/__new/allocate.h b/libcxx/include/__new/allocate.h index 71dffc1776eff..a64663c09fa35 100644 --- a/libcxx/include/__new/allocate.h +++ b/libcxx/include/__new/allocate.h @@ -14,6 +14,8 @@ #include <__cstddef/size_t.h> #include <__new/align_val_t.h> #include <__new/global_new_delete.h> // for _LIBCPP_HAS_SIZED_DEALLOCATION +#include <__type_traits/type_identity.h> +#include <__utility/element_count.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -47,52 +49,58 @@ _LIBCPP_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... 
__args) _NOEXCEPT { #endif } -inline _LIBCPP_HIDE_FROM_ABI void* __libcpp_allocate(size_t __size, size_t __align) { +template +inline _LIBCPP_HIDE_FROM_ABI _Tp* __libcpp_allocate(__element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) { + size_t __size = static_cast(__n) * sizeof(_Tp); #if _LIBCPP_HAS_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __libcpp_operator_new(__size, __align_val); + return static_cast<_Tp*>(std::__libcpp_operator_new(__size, __align_val)); } #endif (void)__align; - return __libcpp_operator_new(__size); + return static_cast<_Tp*>(std::__libcpp_operator_new(__size)); } -template -_LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __size, _Args... __args) _NOEXCEPT { -#if !_LIBCPP_HAS_SIZED_DEALLOCATION - (void)__size; - return std::__libcpp_operator_delete(__ptr, __args...); +#if _LIBCPP_HAS_SIZED_DEALLOCATION +# define _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(...) __VA_ARGS__ #else - return std::__libcpp_operator_delete(__ptr, __size, __args...); +# define _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(...) /* nothing */ #endif -} -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) _NOEXCEPT { +template +inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate( + __type_identity_t<_Tp>* __ptr, __element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { + size_t __size = static_cast(__n) * sizeof(_Tp); + (void)__size; #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; - return __do_deallocate_handle_size(__ptr, __size); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); #else if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __do_deallocate_handle_size(__ptr, __size, __align_val); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size), __align_val); } else { - return __do_deallocate_handle_size(__ptr, __size); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); } #endif } -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) _NOEXCEPT { +#undef _LIBCPP_ONLY_IF_SIZED_DEALLOCATION + +template +inline _LIBCPP_HIDE_FROM_ABI void +__libcpp_deallocate_unsized(__type_identity_t<_Tp>* __ptr, size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; - return __libcpp_operator_delete(__ptr); + return std::__libcpp_operator_delete(__ptr); #else if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __libcpp_operator_delete(__ptr, __align_val); + return std::__libcpp_operator_delete(__ptr, __align_val); } else { - return __libcpp_operator_delete(__ptr); + return std::__libcpp_operator_delete(__ptr); } #endif } diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h index f50eac34a1c05..0bc128b68b579 100644 --- a/libcxx/include/__string/constexpr_c_functions.h +++ b/libcxx/include/__string/constexpr_c_functions.h @@ -25,6 +25,7 @@ #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/is_trivially_lexicographically_comparable.h> #include <__type_traits/remove_cv.h> +#include <__utility/element_count.h> #include <__utility/is_pointer_in_range.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -33,10 +34,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD -// Type used to encode that a 
function takes an integer that represents a number -// of elements as opposed to a number of bytes. -enum class __element_count : size_t {}; - template inline const bool __is_char_type = false; diff --git a/libcxx/include/__utility/element_count.h b/libcxx/include/__utility/element_count.h new file mode 100644 index 0000000000000..82b05a7bde483 --- /dev/null +++ b/libcxx/include/__utility/element_count.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___UTILITY_ELEMENT_COUNT_H +#define _LIBCPP___UTILITY_ELEMENT_COUNT_H + +#include <__config> +#include <__cstddef/size_t.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +// Type used to encode that a function takes an integer that represents a number +// of elements as opposed to a number of bytes. +enum class __element_count : size_t {}; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___UTILITY_ELEMENT_COUNT_H diff --git a/libcxx/include/__utility/small_buffer.h b/libcxx/include/__utility/small_buffer.h index ff6e7e76f14f5..132a57f0fefab 100644 --- a/libcxx/include/__utility/small_buffer.h +++ b/libcxx/include/__utility/small_buffer.h @@ -68,7 +68,7 @@ class __small_buffer { if constexpr (__fits_in_buffer<_Stored>) { return std::launder(reinterpret_cast<_Stored*>(__buffer_)); } else { - byte* __allocation = static_cast(std::__libcpp_allocate(sizeof(_Stored), alignof(_Stored))); + byte* __allocation = reinterpret_cast(std::__libcpp_allocate<_Stored>(__element_count(1))); std::construct_at(reinterpret_cast(__buffer_), __allocation); return std::launder(reinterpret_cast<_Stored*>(__allocation)); } @@ -77,7 +77,7 @@ class __small_buffer { template _LIBCPP_HIDE_FROM_ABI void __dealloc() noexcept { if constexpr (!__fits_in_buffer<_Stored>) - std::__libcpp_deallocate(*reinterpret_cast(__buffer_), sizeof(_Stored), alignof(_Stored)); + std::__libcpp_deallocate<_Stored>(__get<_Stored>(), __element_count(1)); } template diff --git a/libcxx/include/future b/libcxx/include/future index d777ed8d6016f..72f3ed5ca5d27 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -1612,11 +1612,11 @@ inline _Rp __packaged_task_function<_Rp(_ArgTypes...)>::operator()(_ArgTypes... template class _LIBCPP_TEMPLATE_VIS packaged_task<_Rp(_ArgTypes...)> { public: - typedef _Rp result_type; // extension + using result_type _LIBCPP_DEPRECATED = _Rp; // extension private: - __packaged_task_function __f_; - promise __p_; + __packaged_task_function<_Rp(_ArgTypes...)> __f_; + promise<_Rp> __p_; public: // construction and destruction @@ -1653,7 +1653,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool valid() const _NOEXCEPT { return __p_.__state_ != nullptr; } // result retrieval - _LIBCPP_HIDE_FROM_ABI future get_future() { return __p_.get_future(); } + _LIBCPP_HIDE_FROM_ABI future<_Rp> get_future() { return __p_.get_future(); } // execution _LIBCPP_HIDE_FROM_ABI void operator()(_ArgTypes... 
__args); @@ -1700,17 +1700,17 @@ template void packaged_task<_Rp(_ArgTypes...)>::reset() { if (!valid()) __throw_future_error(future_errc::no_state); - __p_ = promise(); + __p_ = promise<_Rp>(); } template class _LIBCPP_TEMPLATE_VIS packaged_task { public: - typedef void result_type; // extension + using result_type _LIBCPP_DEPRECATED = void; // extension private: - __packaged_task_function __f_; - promise __p_; + __packaged_task_function __f_; + promise __p_; public: // construction and destruction @@ -1745,7 +1745,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool valid() const _NOEXCEPT { return __p_.__state_ != nullptr; } // result retrieval - _LIBCPP_HIDE_FROM_ABI future get_future() { return __p_.get_future(); } + _LIBCPP_HIDE_FROM_ABI future get_future() { return __p_.get_future(); } // execution _LIBCPP_HIDE_FROM_ABI void operator()(_ArgTypes... __args); @@ -1804,7 +1804,7 @@ template void packaged_task::reset() { if (!valid()) __throw_future_error(future_errc::no_state); - __p_ = promise(); + __p_ = promise(); } template diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 07ab5649ae45c..69f1b7d094ada 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -760,6 +760,8 @@ module std [system] { module ranges_sort { header "__algorithm/ranges_sort.h" export std.functional.ranges_operations + export std.algorithm.sort + export std.algorithm.make_projected } module ranges_stable_partition { header "__algorithm/ranges_stable_partition.h" @@ -1478,13 +1480,15 @@ module std [system] { textual header "__locale_dir/support/apple.h" textual header "__locale_dir/support/bsd_like.h" textual header "__locale_dir/support/freebsd.h" + textual header "__locale_dir/support/fuchsia.h" + textual header "__locale_dir/support/no_locale/characters.h" + textual header "__locale_dir/support/no_locale/strtonum.h" textual header "__locale_dir/support/windows.h" } module locale_base_api { textual header "__locale_dir/locale_base_api/android.h" textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" - textual header "__locale_dir/locale_base_api/fuchsia.h" textual header "__locale_dir/locale_base_api/ibm.h" textual header "__locale_dir/locale_base_api/musl.h" textual header "__locale_dir/locale_base_api/openbsd.h" @@ -1525,14 +1529,16 @@ module std [system] { module aligned_alloc { header "__memory/aligned_alloc.h" } module allocate_at_least { header "__memory/allocate_at_least.h" } module allocation_guard { header "__memory/allocation_guard.h" } - module allocator { header "__memory/allocator.h" } + module allocator { + header "__memory/allocator.h" + export * // TODO: Workaround for https://github.com/llvm/llvm-project/issues/120108 + } module allocator_arg_t { header "__memory/allocator_arg_t.h" } module allocator_destructor { header "__memory/allocator_destructor.h" } module allocator_traits { header "__memory/allocator_traits.h" } module array_cookie { header "__memory/array_cookie.h" } module assume_aligned { header "__memory/assume_aligned.h" } module auto_ptr { header "__memory/auto_ptr.h" } - module builtin_new_allocator { header "__memory/builtin_new_allocator.h" } module compressed_pair { header "__memory/compressed_pair.h" } module concepts { header "__memory/concepts.h" } module construct_at { header "__memory/construct_at.h" } @@ -1567,6 +1573,7 @@ module std [system] { header "__memory/unique_temporary_buffer.h" export std.memory.unique_ptr export std_core.type_traits.is_constant_evaluated + export * // TODO: Workaround for 
https://github.com/llvm/llvm-project/issues/120108 } module uses_allocator { header "__memory/uses_allocator.h" } module uses_allocator_construction { header "__memory/uses_allocator_construction.h" } @@ -1602,7 +1609,11 @@ module std [system] { module new { header "new" module align_val_t { header "__new/align_val_t.h" } - module allocate { header "__new/allocate.h" } + module allocate { + header "__new/allocate.h" + export std.utility.element_count // used as part of the API + export * // TODO: Workaround for https://github.com/llvm/llvm-project/issues/120108 + } module destroying_delete_t { header "__new/destroying_delete_t.h" } module exceptions { header "__new/exceptions.h" } module global_new_delete { @@ -1909,7 +1920,10 @@ module std [system] { module string { module char_traits { header "__string/char_traits.h" } - module constexpr_c_functions { header "__string/constexpr_c_functions.h" } + module constexpr_c_functions { + header "__string/constexpr_c_functions.h" + export std.utility.element_count // used as part of the constexpr C function's API + } module extern_template_lists { header "__string/extern_template_lists.h" } module fwd { header "__fwd/string.h" } @@ -2019,6 +2033,7 @@ module std [system] { } module cmp { header "__utility/cmp.h" } module convert_to_integral { header "__utility/convert_to_integral.h" } + module element_count { header "__utility/element_count.h" } module exception_guard { header "__utility/exception_guard.h" } module exchange { header "__utility/exchange.h" } module forward_like { header "__utility/forward_like.h" } diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp index e182e5aa66ef9..e1a9e1a8fac49 100644 --- a/libcxx/src/memory_resource.cpp +++ b/libcxx/src/memory_resource.cpp @@ -41,20 +41,22 @@ static bool is_aligned_to(void* ptr, size_t align) { class _LIBCPP_EXPORTED_FROM_ABI __new_delete_memory_resource_imp : public memory_resource { void* do_allocate(size_t bytes, size_t align) override { #if _LIBCPP_HAS_ALIGNED_ALLOCATION - return std::__libcpp_allocate(bytes, align); + return std::__libcpp_allocate(__element_count(bytes), align); #else if (bytes == 0) bytes = 1; - void* result = std::__libcpp_allocate(bytes, align); + std::byte* result = std::__libcpp_allocate(__element_count(bytes), align); if (!is_aligned_to(result, align)) { - std::__libcpp_deallocate(result, bytes, align); + std::__libcpp_deallocate(result, __element_count(bytes), align); __throw_bad_alloc(); } return result; #endif } - void do_deallocate(void* p, size_t bytes, size_t align) override { std::__libcpp_deallocate(p, bytes, align); } + void do_deallocate(void* p, size_t bytes, size_t align) override { + std::__libcpp_deallocate(static_cast(p), __element_count(bytes), align); + } bool do_is_equal(const memory_resource& other) const noexcept override { return &other == this; } }; diff --git a/libcxx/src/shared_mutex.cpp b/libcxx/src/shared_mutex.cpp index 1a346dda027f8..6180833736956 100644 --- a/libcxx/src/shared_mutex.cpp +++ b/libcxx/src/shared_mutex.cpp @@ -38,8 +38,10 @@ bool __shared_mutex_base::try_lock() { } void __shared_mutex_base::unlock() { - lock_guard _(__mut_); - __state_ = 0; + { + lock_guard _(__mut_); + __state_ = 0; + } __gate1_.notify_all(); } @@ -67,16 +69,20 @@ bool __shared_mutex_base::try_lock_shared() { } void __shared_mutex_base::unlock_shared() { - lock_guard _(__mut_); + unique_lock lk(__mut_); unsigned num_readers = (__state_ & __n_readers_) - 1; __state_ &= ~__n_readers_; __state_ |= num_readers; if (__state_ & 
__write_entered_) { - if (num_readers == 0) + if (num_readers == 0) { + lk.unlock(); __gate2_.notify_one(); + } } else { - if (num_readers == __n_readers_ - 1) + if (num_readers == __n_readers_ - 1) { + lk.unlock(); __gate1_.notify_one(); + } } } diff --git a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp b/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp deleted file mode 100644 index 0d90c3250061f..0000000000000 --- a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// flat_map(key_container_type key_cont, mapped_container_type mapped_cont); -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. - -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - std::mt19937 randomness; - std::vector values; - std::vector> pairs; - for (int i = 0; i < 200; ++i) { - uint16_t r = randomness(); - values.push_back(r); - pairs.emplace_back(r, r); - } - - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, Mod256()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, Mod256(), std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - return 0; -} diff --git a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp index b283c8aa06f0c..7ead65caf9fda 100644 --- a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp +++ b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp @@ -96,34 +96,34 @@ struct alloc_stats { }; alloc_stats stats; -void operator delete(void* p)TEST_NOEXCEPT { +void operator delete(void* p) TEST_NOEXCEPT { ::free(p); stats.plain_called++; stats.last_size = stats.last_align = -1; } #ifndef NO_SIZE -void operator delete(void* p, std::size_t n)TEST_NOEXCEPT { +void operator delete(void* p, std::size_t n) TEST_NOEXCEPT { ::free(p); stats.sized_called++; - stats.last_size = n; + stats.last_size = n; stats.last_align = -1; } #endif #ifndef NO_ALIGN -void operator delete(void* p, std::align_val_t a)TEST_NOEXCEPT { +void operator delete(void* p, std::align_val_t a) TEST_NOEXCEPT { std::__libcpp_aligned_free(p); stats.aligned_called++; stats.last_align = static_cast(a); - 
stats.last_size = -1; + stats.last_size = -1; } -void operator delete(void* p, std::size_t n, std::align_val_t a)TEST_NOEXCEPT { +void operator delete(void* p, std::size_t n, std::align_val_t a) TEST_NOEXCEPT { std::__libcpp_aligned_free(p); stats.aligned_sized_called++; stats.last_align = static_cast(a); - stats.last_size = n; + stats.last_size = n; } #endif @@ -135,45 +135,45 @@ void test_libcpp_dealloc() { std::size_t over_align_val = TEST_ALIGNOF(std::max_align_t) * 2; #endif std::size_t under_align_val = TEST_ALIGNOF(int); - std::size_t with_size_val = 2; + std::size_t with_size_val = 2; { - std::__libcpp_deallocate_unsized(p, under_align_val); + std::__libcpp_deallocate_unsized(static_cast(p), under_align_val); assert(stats.expect_plain()); } stats.reset(); #if defined(NO_SIZE) && defined(NO_ALIGN) { - std::__libcpp_deallocate(p, with_size_val, over_align_val); + std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), over_align_val); assert(stats.expect_plain()); } stats.reset(); #elif defined(NO_SIZE) { - std::__libcpp_deallocate(p, with_size_val, over_align_val); + std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), over_align_val); assert(stats.expect_align(over_align_val)); } stats.reset(); #elif defined(NO_ALIGN) { - std::__libcpp_deallocate(p, with_size_val, over_align_val); + std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), over_align_val); assert(stats.expect_size(with_size_val)); } stats.reset(); #else { - std::__libcpp_deallocate(p, with_size_val, over_align_val); + std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), over_align_val); assert(stats.expect_size_align(with_size_val, over_align_val)); } stats.reset(); { - std::__libcpp_deallocate_unsized(p, over_align_val); + std::__libcpp_deallocate_unsized(static_cast(p), over_align_val); assert(stats.expect_align(over_align_val)); } stats.reset(); { - std::__libcpp_deallocate(p, with_size_val, under_align_val); + std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), under_align_val); assert(stats.expect_size(with_size_val)); } stats.reset(); @@ -202,13 +202,13 @@ void test_allocator_and_new_match() { stats.reset(); #elif defined(NO_SIZE) stats.reset(); -#if TEST_STD_VER >= 11 +# if TEST_STD_VER >= 11 { int* x = DoNotOptimize(new int(42)); delete x; assert(stats.expect_plain()); } -#endif +# endif stats.reset(); { AlignedType* a = DoNotOptimize(new AlignedType()); @@ -241,8 +241,7 @@ void test_allocator_and_new_match() { { AlignedType* a = DoNotOptimize(new AlignedType()); delete a; - assert(stats.expect_size_align(sizeof(AlignedType), - TEST_ALIGNOF(AlignedType))); + assert(stats.expect_size_align(sizeof(AlignedType), TEST_ALIGNOF(AlignedType))); } stats.reset(); #endif diff --git a/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp b/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp new file mode 100644 index 0000000000000..4065637e9eb2a --- /dev/null +++ b/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03 + +// + +// template +// class packaged_task +// { +// public: +// typedef R result_type; // extension + +// This libc++ extension is deprecated. See https://github.com/llvm/llvm-project/issues/112856. + +#include +#include + +struct A {}; + +using RA = std::packaged_task::result_type; // expected-warning {{'result_type' is deprecated}} +using RV = std::packaged_task::result_type; // expected-warning {{'result_type' is deprecated}} diff --git a/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp b/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp index 1f17d74513471..659232caa46ec 100644 --- a/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp +++ b/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp @@ -19,16 +19,16 @@ // This is a libc++ extension. +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + #include #include -#include "test_macros.h" - struct A {}; -int main(int, char**) -{ - static_assert((std::is_same::result_type, A>::value), ""); +int main(int, char**) { + static_assert((std::is_same::result_type, A>::value), ""); + static_assert((std::is_same::result_type, void>::value), ""); return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp deleted file mode 100644 index 14189840ce660..0000000000000 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp +++ /dev/null @@ -1,66 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// template -// flat_map(InputIterator first, InputIterator last, const key_compare& comp = key_compare()) -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - std::mt19937 randomness; - std::pair pairs[200]; - for (auto& pair : pairs) { - pair = {uint16_t(randomness()), uint16_t(randomness())}; - } - - { - std::map m(pairs, pairs + 200); - std::flat_map fm(pairs, pairs + 200); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, std::allocator()); - std::flat_map fm(pairs, pairs + 200, std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, Mod256()); - std::flat_map fm(pairs, pairs + 200, Mod256()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, Mod256(), std::allocator()); - std::flat_map fm(pairs, pairs + 200, Mod256(), std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - return 0; -} diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp deleted file mode 100644 index fabcb1d216a78..0000000000000 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp +++ /dev/null @@ -1,63 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// template R> -// void insert_range(R&& rg); -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. 
- -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - { - std::mt19937 randomness; - std::pair pairs[400]; - for (int i = 0; i < 400; ++i) { - uint16_t r = randomness(); - pairs[i] = {r, r}; - } - - std::map m(pairs, pairs + 200); - std::flat_map fm(std::sorted_unique, m.begin(), m.end()); - assert(std::ranges::equal(fm, m)); - - fm.insert_range(std::views::counted(pairs + 200, 200)); - m.insert(pairs + 200, pairs + 400); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - - { - std::vector> v{{1, 2}, {1, 3}}; - std::flat_map m; - m.insert_range(v); - assert(m.size() == 1); - LIBCPP_ASSERT(m[1] == 2); - } - return 0; -} diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index 2a74058673bae..7e9bc23a75acb 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -215,6 +215,8 @@ class ArchSpec { eCore_x86_64_x86_64, eCore_x86_64_x86_64h, // Haswell enabled x86_64 + eCore_x86_64_amd64, + eCore_hexagon_generic, eCore_hexagon_hexagonv4, eCore_hexagon_hexagonv5, diff --git a/lldb/source/Plugins/Language/ObjC/NSError.cpp b/lldb/source/Plugins/Language/ObjC/NSError.cpp index 2356bc4ef4bab..bb54044ae1d61 100644 --- a/lldb/source/Plugins/Language/ObjC/NSError.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSError.cpp @@ -66,8 +66,8 @@ bool lldb_private::formatters::NSError_SummaryProvider( lldb::addr_t domain_location = ptr_value + 3 * ptr_size; Status error; - uint64_t code = process_sp->ReadUnsignedIntegerFromMemory(code_location, - ptr_size, 0, error); + int64_t code = process_sp->ReadSignedIntegerFromMemory(code_location, + ptr_size, 0, error); if (error.Fail()) return false; @@ -77,7 +77,7 @@ bool lldb_private::formatters::NSError_SummaryProvider( return false; if (!domain_str_value) { - stream.Printf("domain: nil - code: %" PRIu64, code); + stream.Printf("domain: nil - code: %" PRIi64, code); return true; } @@ -98,11 +98,11 @@ bool lldb_private::formatters::NSError_SummaryProvider( StreamString domain_str_summary; if (NSStringSummaryProvider(*domain_str_sp, domain_str_summary, options) && !domain_str_summary.Empty()) { - stream.Printf("domain: %s - code: %" PRIu64, domain_str_summary.GetData(), + stream.Printf("domain: %s - code: %" PRIi64, domain_str_summary.GetData(), code); return true; } else { - stream.Printf("domain: nil - code: %" PRIu64, code); + stream.Printf("domain: nil - code: %" PRIi64, code); return true; } } diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 85bb85044ec15..b13e8ff1ec373 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -218,6 +218,9 @@ static const CoreDefinition g_core_definitions[] = { ArchSpec::eCore_x86_64_x86_64, "x86_64"}, {eByteOrderLittle, 8, 1, 15, llvm::Triple::x86_64, ArchSpec::eCore_x86_64_x86_64h, "x86_64h"}, + {eByteOrderLittle, 8, 1, 15, llvm::Triple::x86_64, + ArchSpec::eCore_x86_64_amd64, "amd64"}, + {eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon, ArchSpec::eCore_hexagon_generic, "hexagon"}, {eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon, @@ -1227,6 +1230,7 @@ static bool cores_match(const ArchSpec::Core core1, const ArchSpec::Core core2, break; case ArchSpec::eCore_x86_64_x86_64h: + case ArchSpec::eCore_x86_64_amd64: if (!enforce_exact_match) { try_inverse = false; if (core2 
== ArchSpec::eCore_x86_64_x86_64) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py index 8a052cf84ef0e..de15e5915750b 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py @@ -23,10 +23,12 @@ def test_nserror_with_run_command_no_const(self): self.appkit_tester_impl(self.nserror_data_formatter_commands, False) def nserror_data_formatter_commands(self): - self.expect("frame variable nserror", substrs=['domain: @"Foobar" - code: 12']) + self.expect( + "frame variable nserror", substrs=['domain: @"Foobar" - code: -1234'] + ) self.expect( - "frame variable nserrorptr", substrs=['domain: @"Foobar" - code: 12'] + "frame variable nserrorptr", substrs=['domain: @"Foobar" - code: -1234'] ) self.expect("frame variable nserror->_userInfo", substrs=["2 key/value pairs"]) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m index 0ca5cf98bd3a5..314bada49303d 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m @@ -618,7 +618,7 @@ int main(int argc, const char *argv[]) { NSDictionary *error_userInfo = @{@"a" : @1, @"b" : @2}; NSError *nserror = [[NSError alloc] initWithDomain:@"Foobar" - code:12 + code:-1234 userInfo:error_userInfo]; NSError **nserrorptr = &nserror; diff --git a/lldb/unittests/Utility/ArchSpecTest.cpp b/lldb/unittests/Utility/ArchSpecTest.cpp index de3590b73bbaa..74a4b48456b01 100644 --- a/lldb/unittests/Utility/ArchSpecTest.cpp +++ b/lldb/unittests/Utility/ArchSpecTest.cpp @@ -129,6 +129,12 @@ TEST(ArchSpecTest, TestSetTriple) { EXPECT_STREQ("msp430", AS.GetArchitectureName()); EXPECT_EQ(ArchSpec::eCore_msp430, AS.GetCore()); + AS = ArchSpec(); + EXPECT_TRUE(AS.SetTriple("amd64-unknown-openbsd")); + EXPECT_EQ(llvm::Triple::x86_64, AS.GetTriple().getArch()); + EXPECT_STREQ("amd64", AS.GetArchitectureName()); + EXPECT_EQ(ArchSpec::eCore_x86_64_amd64, AS.GetCore()); + // Various flavors of invalid triples. AS = ArchSpec(); EXPECT_FALSE(AS.SetTriple("unknown-unknown-unknown")); diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst index 857d29e48363b..80e3c2c11153d 100644 --- a/llvm/docs/DirectX/DXILResources.rst +++ b/llvm/docs/DirectX/DXILResources.rst @@ -491,26 +491,28 @@ Examples: i32 %byte_offset, i32 0) -Texture and Typed Buffer Stores -------------------------------- +Stores +------ -*relevant types: Textures and TypedBuffer* +*relevant types: Textures and Buffer* -The `TextureStore`_ and `BufferStore`_ DXIL operations always write all four -32-bit components to a texture or a typed buffer. While both operations include -a mask parameter, it is specified that the mask must cover all components when -used with these types. +The `TextureStore`_, `BufferStore`_, and `RawBufferStore`_ DXIL operations +write four components to a texture or a buffer. These include a mask argument +that is used when fewer than 4 components are written, but notably this only +takes on the contiguous x, xy, xyz, and xyzw values. 
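+
+As an illustrative sketch (the handle and value names here are invented for
+the example, not taken from this change), storing two components through the
+vector-based intrinsics described below corresponds to a DXIL store whose
+mask is the contiguous ``xy`` value:
+
+.. code-block:: llvm
+
+   call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v2f32_1_0_0t.v2f32(
+       target("dx.RawBuffer", <2 x float>, 1, 0, 0) %buffer,
+       i32 %index, i32 0, <2 x float> %data)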
-The store operations that we define as intrinsics behave similarly, and will -only accept writes to the whole of the contained type. This differs from the -loads above, but this makes sense to do from a semantics preserving point of -view. Thus, texture and buffer stores may only operate on 4-element vectors of -types that are 32-bits or fewer, such as ``<4 x i32>``, ``<4 x float>``, and -``<4 x half>``, and 2 element vectors of 64-bit types like ``<2 x double>`` and -``<2 x i64>``. +We define the LLVM store intrinsics to accept vectors when storing multiple +components rather than using `undef` and a mask, but otherwise match the DXIL +ops fairly closely. -.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore .. _TextureStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#texturestore +.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore +.. _RawBufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferstore + +For TypedBuffer, we only need one coordinate, and we must always write a vector +since partial writes aren't possible. Similarly to the load operations +described above, we handle 64-bit types specially and only handle 2-element +vectors rather than 4. Examples: @@ -548,3 +550,85 @@ Examples: target("dx.TypedBuffer", f16, 1, 0) %buf, i32 %index, <4 x f16> %data) call void @llvm.dx.resource.store.typedbuffer.tdx.Buffer_v2f64_1_0_0t( target("dx.TypedBuffer", f64, 1, 0) %buf, i32 %index, <2 x f64> %data) + +For RawBuffer, we need two indices and we accept scalars and vectors of 4 or +fewer elements. Note that we do allow vectors of 4 64-bit elements here. + +Examples: + +.. list-table:: ``@llvm.dx.resource.store.rawbuffer`` + :header-rows: 1 + + * - Argument + - + - Type + - Description + * - Return value + - + - ``void`` + - + * - ``%buffer`` + - 0 + - ``target(dx.RawBuffer, ...)`` + - The buffer to store into + * - ``%index`` + - 1 + - ``i32`` + - Index into the buffer + * - ``%offset`` + - 2 + - ``i32`` + - Byte offset into structured buffer elements + * - ``%data`` + - 3 + - Scalar or vector + - The data to store + +Examples: + +.. 
code-block:: llvm + + ; float + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_f32_1_0_0t.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ; float4 + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v4f32_1_0_0t.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ; struct S0 { float4 f; int4 i; } + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ; struct Q { float4 f; int3 i; } + ; struct R { int z; S x; } + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 0, i32 %data0) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 4, <4 x float> %data1) + call void @llvm.dx.resource.store.rawbuffer.v3f16( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 20, <3 x half> %data2) + + ; byteaddressbuf.Store + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.v4f64( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x double> %data) + diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 33acb5e73d5ff..8cc9036d1b67f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1397,6 +1397,42 @@ Currently, only the following parameter attributes are defined: function, returning a pointer to allocated storage disjoint from the storage for any other object accessible to the caller. +``captures(...)`` + This attribute restricts the ways in which the callee may capture the + pointer. This is not a valid attribute for return values. This attribute + applies only to the particular copy of the pointer passed in this argument. + + The argument of ``captures`` is a list of captured pointer components, + which may be ``none``, or a combination of: + + - ``address``: The integral address of the pointer. + - ``address_is_null`` (subset of ``address``): Whether the address is null. + - ``provenance``: The ability to access the pointer for both read and write + after the function returns. + - ``read_provenance`` (subset of ``provenance``): The ability to access the + pointer only for reads after the function returns. + + Additionally, it is possible to specify that some components are only + captured in certain locations. Currently only the return value (``ret``) + and other (default) locations are supported. + + The :ref:`pointer capture section <pointercapture>` discusses these semantics + in more detail. + + Some examples of how to use the attribute: + + - ``captures(none)``: Pointer not captured. + - ``captures(address, provenance)``: Equivalent to omitting the attribute. + - ``captures(address)``: Address may be captured, but not provenance. + - ``captures(address_is_null)``: Only captures whether the address is null.
+ - ``captures(address, read_provenance)``: Both address and provenance + captured, but only for read-only access. + - ``captures(ret: address, provenance)``: Pointer captured through return + value only. + - ``captures(address_is_null, ret: address, provenance)``: The whole pointer + is captured through the return value, and additionally whether the pointer + is null is captured in some other way. + .. _nocapture: ``nocapture`` @@ -3339,10 +3375,92 @@ Pointer Capture --------------- Given a function call and a pointer that is passed as an argument or stored in -the memory before the call, a pointer is *captured* by the call if it makes a -copy of any part of the pointer that outlives the call. -To be precise, a pointer is captured if one or more of the following conditions -hold: +memory before the call, the call may capture two components of the pointer: + + * The address of the pointer, which is its integral value. This also includes + parts of the address or any information about the address, including the + fact that it does not equal one specific value. We further distinguish + whether only the fact that the address is/isn't null is captured. + * The provenance of the pointer, which is the ability to perform memory + accesses through the pointer, in the sense of the :ref:`pointer aliasing + rules <pointeraliasing>`. We further distinguish whether only read accesses + are allowed, or both reads and writes. + +For example, the following function captures the address of ``%a``, because +it is compared to a pointer, leaking information about the identity of the +pointer: + +.. code-block:: llvm + + @glb = global i8 0 + + define i1 @f(ptr %a) { + %c = icmp eq ptr %a, @glb + ret i1 %c + } + +The function does not capture the provenance of the pointer, because the +``icmp`` instruction only operates on the pointer address. The following +function captures both the address and provenance of the pointer, as both +may be read from ``@glb`` after the function returns: + +.. code-block:: llvm + + @glb = global ptr null + + define void @f(ptr %a) { + store ptr %a, ptr @glb + ret void + } + +The following function captures *neither* the address nor the provenance of +the pointer: + +.. code-block:: llvm + + define i32 @f(ptr %a) { + %v = load i32, ptr %a + ret i32 %v + } + +While address capture includes uses of the address within the body of the +function, provenance capture refers exclusively to the ability to perform +accesses *after* the function returns. Memory accesses within the function +itself are not considered pointer captures. + +We can further say that the capture only occurs through a specific location. +In the following example, the pointer (both address and provenance) is captured +through the return value only: + +.. code-block:: llvm + + define ptr @f(ptr %a) { + %gep = getelementptr i8, ptr %a, i64 4 + ret ptr %gep + } + +However, we always consider direct inspection of the pointer address +(e.g. using ``ptrtoint``) to be location-independent. The following example +is *not* considered a return-only capture, even though the ``ptrtoint`` +ultimately only contributes to the return value: + +.. code-block:: llvm + + @lookup = constant [4 x i8] [i8 0, i8 1, i8 2, i8 3] + + define ptr @f(ptr %a) { + %a.addr = ptrtoint ptr %a to i64 + %mask = and i64 %a.addr, 3 + %gep = getelementptr i8, ptr @lookup, i64 %mask + ret ptr %gep + } + +This definition is chosen to allow capture analysis to continue with the return +value in the usual fashion.
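+
+As a worked sketch (the functions here are illustrative, not part of the
+normative text), a caller that stores the result of a return-only capturing
+call thereby captures the original pointer:
+
+.. code-block:: llvm
+
+    @glb = global ptr null
+
+    declare ptr @f(ptr captures(ret: address, provenance))
+
+    define void @g(ptr %a) {
+      %ret = call ptr @f(ptr %a)
+      store ptr %ret, ptr @glb ; captures %a, via the value returned from @f
+      ret void
+    }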
+ +The following describes possible ways to capture a pointer in more detail, +where unqualified uses of the word "capture" refer to capturing both address +and provenance. 1. The call stores any bit of the pointer carrying information into a place, and the stored bits can be read from the place by the caller after this call @@ -3381,13 +3499,14 @@ hold: @lock = global i1 true define void @f(ptr %a) { - store ptr %a, ptr* @glb + store ptr %a, ptr @glb store atomic i1 false, ptr @lock release ; %a is captured because another thread can safely read @glb store ptr null, ptr @glb ret void } -3. The call's behavior depends on any bit of the pointer carrying information. +3. The call's behavior depends on any bit of the pointer carrying information + (address capture only). .. code-block:: llvm @@ -3395,7 +3514,7 @@ hold: define void @f(ptr %a) { %c = icmp eq ptr %a, @glb - br i1 %c, label %BB_EXIT, label %BB_CONTINUE ; escapes %a + br i1 %c, label %BB_EXIT, label %BB_CONTINUE ; captures address of %a only BB_EXIT: call void @exit() unreachable @@ -3403,8 +3522,7 @@ hold: ret void } -4. The pointer is used in a volatile access as its address. - +4. The pointer is used as the pointer operand of a volatile access. .. _volatile: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 0dc63f34806b4..a1df0f7d686e6 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -447,6 +447,9 @@ The current vendor extensions supported are: ``experimental-Xqcicsr`` LLVM implements `version 0.2 of the Qualcomm uC CSR extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. +``experimental-Xqciint`` + LLVM implements `version 0.2 of the Qualcomm uC Interrupts extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. + ``experimental-Xqcilsm`` LLVM implements `version 0.2 of the Qualcomm uC Load Store Multiple extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index a3febf27ae833..a9d9e5fc7ace4 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -235,6 +235,8 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm uC `Xqcicm` (Conditional Move) extension. +* Adds experimental assembler support for the Qualcomm uC `Xqciint` (Interrupts) + extension. * Added ``Sdext`` and ``Sdtrig`` extensions. Changes to the WebAssembly Backend @@ -377,6 +379,10 @@ Changes to the LLVM tools Changes to LLDB --------------------------------- +* It is now recommended that LLDB be built with Python >= 3.8, but no changes + have been made to the supported Python versions. The next release, LLDB 21, + will require Python >= 3.8. + * LLDB now supports inline diagnostics for the expression evaluator and command line parser. Old: diff --git a/llvm/include/llvm/ADT/PointerUnion.h b/llvm/include/llvm/ADT/PointerUnion.h index 7d4ed02b62262..cdbd76d7f505b 100644 --- a/llvm/include/llvm/ADT/PointerUnion.h +++ b/llvm/include/llvm/ADT/PointerUnion.h @@ -147,12 +147,18 @@ class PointerUnion // isa, cast and the llvm::dyn_cast /// Test if the Union currently holds the type matching T.
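+ /// For example (an illustrative migration only), `PU.is<Foo*>()` becomes
+ /// `isa<Foo*>(PU)`, and `PU.get<Foo*>()` becomes `cast<Foo*>(PU)`.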
- template <typename T> inline bool is() const { return isa<T>(*this); } + template <typename T> + [[deprecated("Use isa<T> instead")]] + inline bool is() const { + return isa<T>(*this); + } /// Returns the value of the specified pointer type. /// /// If the specified pointer type is incorrect, assert. - template <typename T> inline T get() const { + template <typename T> + [[deprecated("Use cast<T> instead")]] + inline T get() const { assert(isa<T>(*this) && "Invalid accessor called"); return cast<T>(*this); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 752313ab15858..fe13fc676e303 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -211,6 +211,12 @@ typedef TargetTransformInfo TTI; /// for IR-level transformations. class TargetTransformInfo { public: + enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend }; + + /// Get the kind of extension that an instruction represents. + static PartialReductionExtendKind + getPartialReductionExtendKind(Instruction *I); + /// Construct a TTI object using a type implementing the \c Concept /// API below. /// @@ -1280,6 +1286,20 @@ class TargetTransformInfo { /// \return if target wants to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const; + /// \return The cost of a partial reduction, which is a reduction from a + /// vector to another vector with fewer elements of larger size. They are + /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// takes an accumulator and a binary operation operand that itself is fed by + /// two extends. An example of an operation that uses a partial reduction is a + /// dot product, which reduces two vectors to another of 4 times fewer and 4 + /// times larger elements. + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional<unsigned> BinOp = std::nullopt) const; + /// \return The maximum interleave factor that any transform should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. @@ -2107,6 +2127,20 @@ class TargetTransformInfo::Concept { /// \return if target wants to issue a prefetch in address space \p AS. virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0; + /// \return The cost of a partial reduction, which is a reduction from a + /// vector to another vector with fewer elements of larger size. They are + /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// takes an accumulator and a binary operation operand that itself is fed by + /// two extends. An example of an operation that uses a partial reduction is a + /// dot product, which reduces two vectors to another of 4 times fewer and 4 + /// times larger elements.
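+ /// For example (illustrative IR, not taken from this change), the cost of
+ /// lowering
+ ///   %r = call <4 x i32> @llvm.experimental.partial.reduce.add.v4i32.v16i32(
+ ///            <4 x i32> %acc, <16 x i32> %ext.mul)
+ /// for a given target would be queried through this hook.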
+ virtual InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp) const = 0; + virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0; virtual InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, @@ -2786,6 +2820,16 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.shouldPrefetchAddressSpace(AS); } + InstructionCost getPartialReductionCost( + unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, + ElementCount VF, PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const override { + return Impl.getPartialReductionCost(Opcode, InputTypeA, InputTypeB, + AccumType, VF, OpAExtend, OpBExtend, + BinOp); + } + unsigned getMaxInterleaveFactor(ElementCount VF) override { return Impl.getMaxInterleaveFactor(VF); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 9c74b2a0c31df..7ac3063ca9a37 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -585,6 +585,15 @@ class TargetTransformInfoImplBase { bool enableWritePrefetching() const { return false; } bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; } + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const { + return InstructionCost::getInvalid(); + } + unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; } InstructionCost getArithmeticInstrCost( diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index 8b195b028783f..c01de4a289a69 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -379,6 +379,7 @@ namespace llvm { bool inAttrGrp, LocTy &BuiltinLoc); bool parseRangeAttr(AttrBuilder &B); bool parseInitializesAttr(AttrBuilder &B); + bool parseCapturesAttr(AttrBuilder &B); bool parseRequiredTypeAttr(AttrBuilder &B, lltok::Kind AttrToken, Attribute::AttrKind AttrKind); diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 178c911120b4c..7b47bc88ddb25 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -207,6 +207,12 @@ enum Kind { kw_inaccessiblememonly, kw_inaccessiblemem_or_argmemonly, + // Captures attribute: + kw_address, + kw_address_is_null, + kw_provenance, + kw_read_provenance, + // nofpclass attribute: kw_all, kw_nan, diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 21fd27d9838db..9eb38c3e44829 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -788,6 +788,7 @@ enum AttributeKindCodes { ATTR_KIND_NO_EXT = 99, ATTR_KIND_NO_DIVERGENCE_SOURCE = 100, ATTR_KIND_SANITIZE_TYPE = 101, + ATTR_KIND_CAPTURES = 102, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index dbad3469d047d..09a6ca936fe1f 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ 
b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -457,16 +457,6 @@ template <> struct ScalarTraits { static QuotingType mustQuote(StringRef S) { return needsQuotes(S); } }; -/// Identifies call instruction location in machine function. -struct MachineInstrLoc { - unsigned BlockNum; - unsigned Offset; - - bool operator==(const MachineInstrLoc &Other) const { - return BlockNum == Other.BlockNum && Offset == Other.Offset; - } -}; - /// Serializable representation of CallSiteInfo. struct CallSiteInfo { // Representation of call argument and register which is used to @@ -480,6 +470,16 @@ struct CallSiteInfo { } }; + /// Identifies call instruction location in machine function. + struct MachineInstrLoc { + unsigned BlockNum; + unsigned Offset; + + bool operator==(const MachineInstrLoc &Other) const { + return BlockNum == Other.BlockNum && Offset == Other.Offset; + } + }; + MachineInstrLoc CallLocation; std::vector ArgForwardingRegs; @@ -595,26 +595,6 @@ template <> struct MappingTraits { } }; -struct CalledGlobal { - MachineInstrLoc CallSite; - StringValue Callee; - unsigned Flags; - - bool operator==(const CalledGlobal &Other) const { - return CallSite == Other.CallSite && Callee == Other.Callee && - Flags == Other.Flags; - } -}; - -template <> struct MappingTraits { - static void mapping(IO &YamlIO, CalledGlobal &CG) { - YamlIO.mapRequired("bb", CG.CallSite.BlockNum); - YamlIO.mapRequired("offset", CG.CallSite.Offset); - YamlIO.mapRequired("callee", CG.Callee); - YamlIO.mapRequired("flags", CG.Flags); - } -}; - } // end namespace yaml } // end namespace llvm @@ -626,7 +606,6 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::FixedMachineStackObject) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CallSiteInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineConstantPoolValue) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineJumpTable::Entry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CalledGlobal) namespace llvm { namespace yaml { @@ -785,7 +764,6 @@ struct MachineFunction { std::vector DebugValueSubstitutions; MachineJumpTable JumpTableInfo; std::vector MachineMetadataNodes; - std::vector CalledGlobals; BlockStringValue Body; }; @@ -844,9 +822,6 @@ template <> struct MappingTraits { if (!YamlIO.outputting() || !MF.MachineMetadataNodes.empty()) YamlIO.mapOptional("machineMetadataNodes", MF.MachineMetadataNodes, std::vector()); - if (!YamlIO.outputting() || !MF.CalledGlobals.empty()) - YamlIO.mapOptional("calledGlobals", MF.CalledGlobals, - std::vector()); YamlIO.mapOptional("body", MF.Body, BlockStringValue()); } }; diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index 282aee2a69c4d..d696add8a1af5 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -354,11 +354,6 @@ class LLVM_ABI MachineFunction { /// a table of valid targets for Windows EHCont Guard. std::vector CatchretTargets; - /// Mapping of call instruction to the global value and target flags that it - /// calls, if applicable. - DenseMap> - CalledGlobalsMap; - /// \name Exception Handling /// \{ @@ -1187,26 +1182,6 @@ class LLVM_ABI MachineFunction { CatchretTargets.push_back(Target); } - /// Tries to get the global and target flags for a call site, if the - /// instruction is a call to a global. - std::pair - tryGetCalledGlobal(const MachineInstr *MI) const { - return CalledGlobalsMap.lookup(MI); - } - - /// Notes the global and target flags for a call site. 
- void addCalledGlobal(const MachineInstr *MI, - std::pair Details) { - assert(MI && "MI must not be null"); - assert(Details.first && "Global must not be null"); - CalledGlobalsMap.insert({MI, Details}); - } - - /// Iterates over the full set of call sites and their associated globals. - auto getCalledGlobals() const { - return llvm::make_range(CalledGlobalsMap.begin(), CalledGlobalsMap.end()); - } - /// \name Exception Handling /// \{ diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index b31ad11c3ee0e..ff7caec41855f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -293,7 +293,6 @@ class SelectionDAG { MDNode *HeapAllocSite = nullptr; MDNode *PCSections = nullptr; MDNode *MMRA = nullptr; - std::pair CalledGlobal{}; bool NoMerge = false; }; /// Out-of-line extra information for SDNodes. @@ -2374,19 +2373,6 @@ class SelectionDAG { auto It = SDEI.find(Node); return It != SDEI.end() ? It->second.MMRA : nullptr; } - /// Set CalledGlobal to be associated with Node. - void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, - unsigned OpFlags) { - SDEI[Node].CalledGlobal = {GV, OpFlags}; - } - /// Return CalledGlobal associated with Node, or a nullopt if none exists. - std::optional> - getCalledGlobal(const SDNode *Node) { - auto I = SDEI.find(Node); - return I != SDEI.end() - ? std::make_optional(std::move(I->second).CalledGlobal) - : std::nullopt; - } /// Set NoMergeSiteInfo to be associated with Node if NoMerge is true. void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge) { if (NoMerge) diff --git a/llvm/include/llvm/CodeGen/Spiller.h b/llvm/include/llvm/CodeGen/Spiller.h index 51ad36bc6b1f8..3132cefeb6c68 100644 --- a/llvm/include/llvm/CodeGen/Spiller.h +++ b/llvm/include/llvm/CodeGen/Spiller.h @@ -19,6 +19,10 @@ class MachineFunction; class MachineFunctionPass; class VirtRegMap; class VirtRegAuxInfo; +class LiveIntervals; +class LiveStacks; +class MachineDominatorTree; +class MachineBlockFrequencyInfo; /// Spiller interface. /// @@ -41,12 +45,20 @@ class Spiller { virtual ArrayRef getReplacedRegs() = 0; virtual void postOptimization() {} + + struct RequiredAnalyses { + LiveIntervals &LIS; + LiveStacks &LSS; + MachineDominatorTree &MDT; + const MachineBlockFrequencyInfo &MBFI; + }; }; /// Create and return a spiller that will insert spill code directly instead /// of deferring though VirtRegMap. -Spiller *createInlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, - VirtRegMap &VRM, VirtRegAuxInfo &VRAI); +Spiller *createInlineSpiller(const Spiller::RequiredAnalyses &Analyses, + MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI); } // end namespace llvm diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 2755ced404ddd..7612e553fe32e 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -284,6 +284,9 @@ class Attribute { /// Returns memory effects. MemoryEffects getMemoryEffects() const; + /// Returns information from captures attribute. 
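+ /// For instance, for an argument declared `captures(ret: address, provenance)`
+ /// the returned info has no other-location components and address plus
+ /// provenance in its return components (an illustrative pairing only).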
+ CaptureInfo getCaptureInfo() const; + /// Return the FPClassTest for nofpclass FPClassTest getNoFPClass() const; @@ -436,6 +439,7 @@ class AttributeSet { UWTableKind getUWTableKind() const; AllocFnKind getAllocKind() const; MemoryEffects getMemoryEffects() const; + CaptureInfo getCaptureInfo() const; FPClassTest getNoFPClass() const; std::string getAsString(bool InAttrGrp = false) const; @@ -1260,6 +1264,9 @@ class AttrBuilder { /// Add memory effect attribute. AttrBuilder &addMemoryAttr(MemoryEffects ME); + /// Add captures attribute. + AttrBuilder &addCapturesAttr(CaptureInfo CI); + // Add nofpclass attribute AttrBuilder &addNoFPClassAttr(FPClassTest NoFPClassMask); diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 61955cf883c3f..4396ec4d04c41 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,9 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Function creates no aliases of pointer. def NoCapture : EnumAttr<"nocapture", IntersectAnd, [ParamAttr]>; +/// Specify how the pointer may be captured. +def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 59eb504098837..9a41971b63373 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1266,15 +1266,10 @@ class ICmpInst: public CmpInst { return getFlippedSignednessPredicate(getPredicate()); } - /// Determine if Pred1 implies Pred2 is true when two compares have matching - /// operands. - static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2); - - /// Determine if Pred1 implies Pred2 is false when two compares have matching - /// operands. - static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2); + /// Determine if Pred1 implies Pred2 is true, false, or if nothing can be + /// inferred about the implication, when two compares have matching operands. 
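+ /// For example, `slt` implies `sle` is true and implies `eq` is false,
+ /// while `sge` implies nothing about `sgt`, so std::nullopt is returned
+ /// (predicate pairs chosen purely as illustrations).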
+ static std::optional<bool> isImpliedByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2); void setSameSign(bool B = true) { SubclassOptionalData = (SubclassOptionalData & ~SameSign) | (B * SameSign); } diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 3436216d478e3..6ccbb6b185c7d 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1873,13 +1873,13 @@ class ConvergenceControlInst : public IntrinsicInst { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } - bool isAnchor() { + bool isAnchor() const { return getIntrinsicID() == Intrinsic::experimental_convergence_anchor; } - bool isEntry() { + bool isEntry() const { return getIntrinsicID() == Intrinsic::experimental_convergence_entry; } - bool isLoop() { + bool isLoop() const { return getIntrinsicID() == Intrinsic::experimental_convergence_loop; } }; diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index ef48af5b42dbf..2a56ba78ce88e 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -40,6 +40,10 @@ def int_dx_resource_load_rawbuffer : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty], [llvm_any_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; +def int_dx_resource_store_rawbuffer + : DefaultAttrsIntrinsic< + [], [llvm_any_ty, llvm_i32_ty, llvm_i32_ty, llvm_any_ty], + [IntrWriteMem]>; def int_dx_resource_updatecounter : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty], diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index ae04a130bc825..00a76018d8415 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1438,6 +1438,16 @@ let TargetPrefix = "nvvm" in { def int_nvvm_f2tf32_rna : ClangBuiltin<"__nvvm_f2tf32_rna">, Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rna_satfinite : ClangBuiltin<"__nvvm_f2tf32_rna_satfinite">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rn : ClangBuiltin<"__nvvm_f2tf32_rn">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rn_relu : ClangBuiltin<"__nvvm_f2tf32_rn_relu">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rz : ClangBuiltin<"__nvvm_f2tf32_rz">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2tf32_rz_relu : ClangBuiltin<"__nvvm_f2tf32_rz_relu">, + Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; def int_nvvm_ff_to_e4m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e4m3x2_rn">, Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index b4d2dce66a6f0..37057271b6c28 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -33,7 +33,7 @@ let TargetPrefix = "spv" in { def int_spv_ptrcast : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_metadata_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>; def int_spv_switch : Intrinsic<[], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_loop_merge : Intrinsic<[], [llvm_vararg_ty]>; - def int_spv_selection_merge : Intrinsic<[], [llvm_vararg_ty]>; + def int_spv_selection_merge : Intrinsic<[], [llvm_any_ty, llvm_i32_ty], [ImmArg<ArgIndex<1>>]>; def int_spv_cmpxchg : Intrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_vararg_ty]>; def
int_spv_unreachable : Intrinsic<[], []>; def int_spv_alloca : Intrinsic<[llvm_any_ty], [llvm_i8_ty], [ImmArg>]>; diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h index fb575fe721015..e2a2c84e47910 100644 --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -73,10 +73,6 @@ class MCObjectFileInfo { /// to emit them into. MCSection *CompactUnwindSection = nullptr; - /// If import call optimization is supported by the target, this is the - /// section to emit import call data to. - MCSection *ImportCallSection = nullptr; - // Dwarf sections for debug info. If a target supports debug info, these must // be set. MCSection *DwarfAbbrevSection = nullptr; @@ -273,7 +269,6 @@ class MCObjectFileInfo { MCSection *getBSSSection() const { return BSSSection; } MCSection *getReadOnlySection() const { return ReadOnlySection; } MCSection *getLSDASection() const { return LSDASection; } - MCSection *getImportCallSection() const { return ImportCallSection; } MCSection *getCompactUnwindSection() const { return CompactUnwindSection; } MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; } MCSection *getDwarfInfoSection() const { return DwarfInfoSection; } diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 558b14cebfd3d..21da4dac4872b 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -569,14 +569,6 @@ class MCStreamer { /// \param Symbol - Symbol the image relative relocation should point to. virtual void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset); - /// Emits the physical number of the section containing the given symbol as - /// assigned during object writing (i.e., this is not a runtime relocation). - virtual void emitCOFFSecNumber(MCSymbol const *Symbol); - - /// Emits the offset of the symbol from the beginning of the section during - /// object writing (i.e., this is not a runtime relocation). - virtual void emitCOFFSecOffset(MCSymbol const *Symbol); - /// Emits an lcomm directive with XCOFF csect information. /// /// \param LabelSym - Label on the block of storage. diff --git a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h index 13d8c7d060c9e..a4ede61e45099 100644 --- a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h +++ b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h @@ -72,7 +72,6 @@ class WinCOFFObjectWriter final : public MCObjectWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; uint64_t writeObject(MCAssembler &Asm) override; - int getSectionNumber(const MCSection &Section) const; }; /// Construct a new Win COFF writer instance. 
diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h index 2425abe51e6dd..5c39d80538944 100644 --- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h +++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h @@ -58,8 +58,6 @@ class MCWinCOFFStreamer : public MCObjectStreamer { void emitCOFFSectionIndex(MCSymbol const *Symbol) override; void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; - void emitCOFFSecNumber(MCSymbol const *Symbol) override; - void emitCOFFSecOffset(MCSymbol const *Symbol) override; void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size, Align ByteAlignment) override; void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 29763995e8b51..8a43197d2d45e 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -194,7 +194,6 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS( #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, PASS_NAME) #endif -DUMMY_FUNCTION_PASS("atomic-expand", AtomicExpandPass) #undef DUMMY_FUNCTION_PASS #ifndef DUMMY_MACHINE_MODULE_PASS diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index d7c1eda81c006..34a7feb63bec4 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -2547,13 +2547,9 @@ class ICmpInst : public CmpInst { WRAP_STATIC_PREDICATE(isGE); WRAP_STATIC_PREDICATE(isLE); - static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2) { - return llvm::ICmpInst::isImpliedTrueByMatchingCmp(Pred1, Pred2); - } - static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2) { - return llvm::ICmpInst::isImpliedFalseByMatchingCmp(Pred1, Pred2); + static std::optional<bool> isImpliedByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2) { + return llvm::ICmpInst::isImpliedByMatchingCmp(Pred1, Pred2); } static auto predicates() { return llvm::ICmpInst::predicates(); } diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 5a9d80c87ae27..9ecdab71ec8ca 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -273,6 +273,107 @@ raw_ostream &operator<<(raw_ostream &OS, MemoryEffects RMRB); // Legacy alias. using FunctionModRefBehavior = MemoryEffects; +/// Components of the pointer that may be captured.
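+///
+/// Bit layout sketch, derived from the enumerator values below (shown only
+/// for orientation): AddressIsNull = 0b0001, Address = 0b0011,
+/// ReadProvenance = 0b0100, Provenance = 0b1100, All = 0b1111.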
+enum class CaptureComponents : uint8_t { + None = 0, + AddressIsNull = (1 << 0), + Address = (1 << 1) | AddressIsNull, + ReadProvenance = (1 << 2), + Provenance = (1 << 3) | ReadProvenance, + All = Address | Provenance, + LLVM_MARK_AS_BITMASK_ENUM(Provenance), +}; + +inline bool capturesNothing(CaptureComponents CC) { + return CC == CaptureComponents::None; +} + +inline bool capturesAnything(CaptureComponents CC) { + return CC != CaptureComponents::None; +} + +inline bool capturesAddressIsNullOnly(CaptureComponents CC) { + return (CC & CaptureComponents::Address) == CaptureComponents::AddressIsNull; +} + +inline bool capturesAddress(CaptureComponents CC) { + return (CC & CaptureComponents::Address) != CaptureComponents::None; +} + +inline bool capturesReadProvenanceOnly(CaptureComponents CC) { + return (CC & CaptureComponents::Provenance) == + CaptureComponents::ReadProvenance; +} + +inline bool capturesFullProvenance(CaptureComponents CC) { + return (CC & CaptureComponents::Provenance) == CaptureComponents::Provenance; +} + +raw_ostream &operator<<(raw_ostream &OS, CaptureComponents CC); + +/// Represents which components of the pointer may be captured in which +/// location. This represents the captures(...) attribute in IR. +/// +/// For more information on the precise semantics see LangRef. +class CaptureInfo { + CaptureComponents OtherComponents; + CaptureComponents RetComponents; + +public: + CaptureInfo(CaptureComponents OtherComponents, + CaptureComponents RetComponents) + : OtherComponents(OtherComponents), RetComponents(RetComponents) {} + + CaptureInfo(CaptureComponents Components) + : OtherComponents(Components), RetComponents(Components) {} + + /// Create CaptureInfo that may capture all components of the pointer. + static CaptureInfo all() { return CaptureInfo(CaptureComponents::All); } + + /// Get components potentially captured by the return value. + CaptureComponents getRetComponents() const { return RetComponents; } + + /// Get components potentially captured through locations other than the + /// return value. + CaptureComponents getOtherComponents() const { return OtherComponents; } + + /// Get the potentially captured components of the pointer (regardless of + /// location). + operator CaptureComponents() const { return OtherComponents | RetComponents; } + + bool operator==(CaptureInfo Other) const { + return OtherComponents == Other.OtherComponents && + RetComponents == Other.RetComponents; + } + + bool operator!=(CaptureInfo Other) const { return !(*this == Other); } + + /// Compute union of CaptureInfos. + CaptureInfo operator|(CaptureInfo Other) const { + return CaptureInfo(OtherComponents | Other.OtherComponents, + RetComponents | Other.RetComponents); + } + + /// Compute intersection of CaptureInfos. + CaptureInfo operator&(CaptureInfo Other) const { + return CaptureInfo(OtherComponents & Other.OtherComponents, + RetComponents & Other.RetComponents); + } + + static CaptureInfo createFromIntValue(uint32_t Data) { + return CaptureInfo(CaptureComponents(Data >> 4), + CaptureComponents(Data & 0xf)); + } + + /// Convert CaptureInfo into an encoded integer value (used by captures + /// attribute). 
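+ /// For example, CaptureInfo(CaptureComponents::Address) encodes as 0x33
+ /// (address captured both in other locations and in the return value), and
+ /// CaptureInfo(CaptureComponents::None, CaptureComponents::All) encodes as
+ /// 0x0f (capture through the return value only); both are worked examples
+ /// of the nibble layout, not special constants.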
diff --git a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h
index 836fc907375d3..ab2dcee06551e 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h
@@ -29,6 +29,7 @@ class BoundsCheckingPass : public PassInfoMixin<BoundsCheckingPass> {
   };
   std::optional<Runtime> Rt; // Trap if empty.
   bool Merge = false;
+  std::optional<int8_t> GuardKind; // `allow_ubsan_check` argument.
 };
 
 BoundsCheckingPass(Options Opts) : Opts(Opts) {}
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 38e9145826c08..2a68979add666 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1921,6 +1921,7 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
   if (StrideAPtr && *StrideAPtr < 0) {
     std::swap(Src, Sink);
     std::swap(AInst, BInst);
+    std::swap(ATy, BTy);
    std::swap(StrideAPtr, StrideBPtr);
   }
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b32dffa9f0fe8..df42dc2746daf 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -863,6 +863,15 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const {
   return TTIImpl->shouldPrefetchAddressSpace(AS);
 }
 
+InstructionCost TargetTransformInfo::getPartialReductionCost(
+    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
+    ElementCount VF, PartialReductionExtendKind OpAExtend,
+    PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp) const {
+  return TTIImpl->getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
+                                          AccumType, VF, OpAExtend, OpBExtend,
+                                          BinOp);
+}
+
 unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
   return TTIImpl->getMaxInterleaveFactor(VF);
 }
@@ -974,6 +983,15 @@ InstructionCost TargetTransformInfo::getShuffleCost(
   return Cost;
 }
 
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
+  if (isa<SExtInst>(I))
+    return PR_SignExtend;
+  if (isa<ZExtInst>(I))
+    return PR_ZeroExtend;
+  return PR_None;
+}
+
 TTI::CastContextHint
 TargetTransformInfo::getCastContextHint(const Instruction *I) {
   if (!I)
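As a hedged sketch of how a target might hook the new query: `MyTargetTTIImpl` and the particular costs below are assumptions for illustration only, not an implementation from this patch; a real target would derive the cost from its dot-product/partial-reduction instructions.

```cpp
InstructionCost MyTargetTTIImpl::getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
    TTI::PartialReductionExtendKind OpBExtend,
    std::optional<unsigned> BinOp) const {
  // Only an add of a (possibly extended) multiply maps onto a partial
  // reduction in this sketch; report anything else as invalid so the
  // vectorizer falls back to an ordinary reduction.
  if (Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul)
    return InstructionCost::getInvalid();
  // Assume mixed sign/zero extends are unsupported.
  if (OpAExtend == TTI::PR_None || OpAExtend != OpBExtend)
    return InstructionCost::getInvalid();
  return 1; // Assume one dot-product instruction per vector chunk.
}
```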
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 0e50fc60ce792..d03e6f5a5754d 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9384,19 +9384,6 @@ isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
   }
 }
 
-/// Return true if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is true.
-/// Return false if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is false.
-/// Otherwise, return std::nullopt if we can't infer anything.
-static std::optional<bool> isImpliedCondMatchingOperands(CmpPredicate LPred,
-                                                         CmpPredicate RPred) {
-  if (ICmpInst::isImpliedTrueByMatchingCmp(LPred, RPred))
-    return true;
-  if (ICmpInst::isImpliedFalseByMatchingCmp(LPred, RPred))
-    return false;
-
-  return std::nullopt;
-}
-
 /// Return true if "icmp LPred X, LCR" implies "icmp RPred X, RCR" is true.
 /// Return false if "icmp LPred X, LCR" implies "icmp RPred X, RCR" is false.
 /// Otherwise, return std::nullopt if we can't infer anything.
@@ -9489,7 +9476,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0,
 
   // Can we infer anything when the two compares have matching operands?
   if (L0 == R0 && L1 == R1)
-    return isImpliedCondMatchingOperands(LPred, RPred);
+    return ICmpInst::isImpliedByMatchingCmp(LPred, RPred);
 
   // It only really makes sense in the context of signed comparison for "X - Y
   // must be positive if X >= Y and no overflow".
@@ -9499,7 +9486,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0,
        CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SGE)) &&
       match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) {
     if (match(R1, m_NonPositive()) &&
-        isImpliedCondMatchingOperands(LPred, RPred) == false)
+        ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == false)
       return false;
   }
 
@@ -9509,7 +9496,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0,
        CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SLE)) &&
       match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) {
     if (match(R1, m_NonNegative()) &&
-        isImpliedCondMatchingOperands(LPred, RPred) == true)
+        ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == true)
       return true;
   }
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 1b8e033134f51..5ea507c009bdc 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -704,6 +704,10 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(argmemonly);
   KEYWORD(inaccessiblememonly);
   KEYWORD(inaccessiblemem_or_argmemonly);
+  KEYWORD(address_is_null);
+  KEYWORD(address);
+  KEYWORD(provenance);
+  KEYWORD(read_provenance);
 
   // nofpclass attribute
   KEYWORD(all);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 52d48a69f0eb5..81d048b32e139 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1644,6 +1644,8 @@ bool LLParser::parseEnumAttribute(Attribute::AttrKind Attr, AttrBuilder &B,
     return parseRangeAttr(B);
   case Attribute::Initializes:
     return parseInitializesAttr(B);
+  case Attribute::Captures:
+    return parseCapturesAttr(B);
   default:
     B.addAttribute(Attr);
     Lex.Lex();
@@ -3165,6 +3167,65 @@ bool LLParser::parseInitializesAttr(AttrBuilder &B) {
   return false;
 }
 
+bool LLParser::parseCapturesAttr(AttrBuilder &B) {
+  CaptureComponents Other = CaptureComponents::None;
+  std::optional<CaptureComponents> Ret;
+
+  // We use syntax like captures(ret: address, provenance), so the colon
+  // should not be interpreted as a label terminator.
+  Lex.setIgnoreColonInIdentifiers(true);
+  auto _ = make_scope_exit([&] { Lex.setIgnoreColonInIdentifiers(false); });
+
+  Lex.Lex();
+  if (parseToken(lltok::lparen, "expected '('"))
+    return true;
+
+  CaptureComponents *Current = &Other;
+  bool SeenComponent = false;
+  while (true) {
+    if (EatIfPresent(lltok::kw_ret)) {
+      if (parseToken(lltok::colon, "expected ':'"))
+        return true;
+      if (Ret)
+        return tokError("duplicate 'ret' location");
+      Ret = CaptureComponents::None;
+      Current = &*Ret;
+      SeenComponent = false;
+    }
+
+    if (EatIfPresent(lltok::kw_none)) {
+      if (SeenComponent)
+        return tokError("cannot use 'none' with other component");
+      *Current = CaptureComponents::None;
+    } else {
+      if (SeenComponent && capturesNothing(*Current))
+        return tokError("cannot use 'none' with other component");
+
+      if (EatIfPresent(lltok::kw_address_is_null))
+        *Current |= CaptureComponents::AddressIsNull;
+      else if (EatIfPresent(lltok::kw_address))
+        *Current |= CaptureComponents::Address;
+      else if (EatIfPresent(lltok::kw_provenance))
+        *Current |= CaptureComponents::Provenance;
+      else if (EatIfPresent(lltok::kw_read_provenance))
+        *Current |= CaptureComponents::ReadProvenance;
+      else
+        return tokError("expected one of 'none', 'address', 'address_is_null', "
+                        "'provenance' or 'read_provenance'");
+    }
+
+    SeenComponent = true;
+    if (EatIfPresent(lltok::rparen))
+      break;
+
+    if (parseToken(lltok::comma, "expected ',' or ')'"))
+      return true;
+  }
+
+  B.addCapturesAttr(CaptureInfo(Other, Ret.value_or(Other)));
+  return false;
+}
+
 /// parseOptionalOperandBundles
 ///    ::= /*empty*/
 ///    ::= '[' OperandBundle [, OperandBundle ]* ']'
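A short usage sketch of the C++ surface that parallels this textual syntax, assuming the `AttrBuilder`/`CaptureInfo` additions from this patch and standard LLVM headers:

```cpp
#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

void addCapturesExample(LLVMContext &Ctx) {
  AttrBuilder B(Ctx);
  // Equivalent of captures(address_is_null, ret: address, provenance):
  // other locations may only capture whether the address is null, while
  // the return value may capture the address and full provenance.
  B.addCapturesAttr(CaptureInfo(CaptureComponents::AddressIsNull,
                                CaptureComponents::Address |
                                    CaptureComponents::Provenance));
  for (const Attribute &A : B.attrs())
    errs() << A.getAsString() << "\n"; // Prints the captures(...) form.
}
```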
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index a01ecf0d56642..56f5ff4b20e5d 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2250,6 +2250,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
     return Attribute::CoroElideSafe;
   case bitc::ATTR_KIND_NO_EXT:
     return Attribute::NoExt;
+  case bitc::ATTR_KIND_CAPTURES:
+    return Attribute::Captures;
   }
 }
 
@@ -2389,6 +2391,8 @@ Error BitcodeReader::parseAttributeGroupBlock() {
         B.addAllocKindAttr(static_cast<AllocFnKind>(Record[++i]));
       else if (Kind == Attribute::Memory)
         B.addMemoryAttr(MemoryEffects::createFromIntValue(Record[++i]));
+      else if (Kind == Attribute::Captures)
+        B.addCapturesAttr(CaptureInfo::createFromIntValue(Record[++i]));
       else if (Kind == Attribute::NoFPClass)
         B.addNoFPClassAttr(
            static_cast<FPClassTest>(Record[++i] & fcAllFlags));
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index b4efd3928a2e6..94d3afa6c1e33 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -907,6 +907,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
     return bitc::ATTR_KIND_INITIALIZES;
   case Attribute::NoExt:
     return bitc::ATTR_KIND_NO_EXT;
+  case Attribute::Captures:
+    return bitc::ATTR_KIND_CAPTURES;
   case Attribute::EndAttrKinds:
     llvm_unreachable("Can not encode end-attribute kinds marker.");
   case Attribute::None:
Instruction at bb:") + - Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset) + - " is not a call instruction"); - - auto Callee = - F.getParent()->getValueSymbolTable().lookup(YamlCG.Callee.Value); - if (!Callee) - return error(YamlCG.Callee.SourceRange.Start, - "use of undefined global '" + YamlCG.Callee.Value + "'"); - if (!isa(Callee)) - return error(YamlCG.Callee.SourceRange.Start, - "use of non-global value '" + YamlCG.Callee.Value + "'"); - - MF.addCalledGlobal(CallI, {cast(Callee), YamlCG.Flags}); - } - - return false; -} - SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error, SMRange SourceRange) { assert(SourceRange.isValid() && "Invalid source range"); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index b6da495590fe1..c8f6341c1224d 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -133,9 +133,6 @@ class MIRPrinter { void convertMachineMetadataNodes(yaml::MachineFunction &YMF, const MachineFunction &MF, MachineModuleSlotTracker &MST); - void convertCalledGlobals(yaml::MachineFunction &YMF, - const MachineFunction &MF, - MachineModuleSlotTracker &MST); private: void initRegisterMaskIds(const MachineFunction &MF); @@ -272,8 +269,6 @@ void MIRPrinter::print(const MachineFunction &MF) { // function. convertMachineMetadataNodes(YamlMF, MF, MST); - convertCalledGlobals(YamlMF, MF, MST); - yaml::Output Out(OS); if (!SimplifyMIR) Out.setWriteDefaultValues(true); @@ -560,7 +555,7 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF, const auto *TRI = MF.getSubtarget().getRegisterInfo(); for (auto CSInfo : MF.getCallSitesInfo()) { yaml::CallSiteInfo YmlCS; - yaml::MachineInstrLoc CallLocation; + yaml::CallSiteInfo::MachineInstrLoc CallLocation; // Prepare instruction position. MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator(); @@ -601,32 +596,6 @@ void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF, } } -void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF, - const MachineFunction &MF, - MachineModuleSlotTracker &MST) { - for (const auto &[CallInst, CG] : MF.getCalledGlobals()) { - // If the call instruction was dropped, then we don't need to print it. - auto BB = CallInst->getParent(); - if (BB) { - yaml::MachineInstrLoc CallSite; - CallSite.BlockNum = CallInst->getParent()->getNumber(); - CallSite.Offset = std::distance(CallInst->getParent()->instr_begin(), - CallInst->getIterator()); - - yaml::CalledGlobal YamlCG{CallSite, CG.first->getName().str(), CG.second}; - YMF.CalledGlobals.push_back(YamlCG); - } - } - - // Sort by position of call instructions. 
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index de2fe925c2d5c..e2543f883f91c 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -158,9 +158,6 @@ class MIRParserImpl {
                                 MachineFunction &MF,
                                 const yaml::MachineFunction &YMF);
 
-  bool parseCalledGlobals(PerFunctionMIParsingState &PFS, MachineFunction &MF,
-                          const yaml::MachineFunction &YMF);
-
 private:
   bool parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node,
                    const yaml::StringValue &Source);
@@ -186,9 +183,6 @@ class MIRParserImpl {
   void setupDebugValueTracking(MachineFunction &MF,
                                PerFunctionMIParsingState &PFS,
                                const yaml::MachineFunction &YamlMF);
-
-  bool parseMachineInst(MachineFunction &MF, yaml::MachineInstrLoc MILoc,
-                        MachineInstr const *&MI);
 };
 
 } // end namespace llvm
@@ -463,34 +457,24 @@ bool MIRParserImpl::computeFunctionProperties(
   return false;
 }
 
-bool MIRParserImpl::parseMachineInst(MachineFunction &MF,
-                                     yaml::MachineInstrLoc MILoc,
-                                     MachineInstr const *&MI) {
-  if (MILoc.BlockNum >= MF.size()) {
-    return error(Twine(MF.getName()) +
-                 Twine(" instruction block out of range.") +
-                 " Unable to reference bb:" + Twine(MILoc.BlockNum));
-  }
-  auto BB = std::next(MF.begin(), MILoc.BlockNum);
-  if (MILoc.Offset >= BB->size())
-    return error(
-        Twine(MF.getName()) + Twine(" instruction offset out of range.") +
-        " Unable to reference instruction at bb: " + Twine(MILoc.BlockNum) +
-        " at offset:" + Twine(MILoc.Offset));
-  MI = &*std::next(BB->instr_begin(), MILoc.Offset);
-  return false;
-}
-
 bool MIRParserImpl::initializeCallSiteInfo(
     PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF) {
   MachineFunction &MF = PFS.MF;
   SMDiagnostic Error;
   const TargetMachine &TM = MF.getTarget();
   for (auto &YamlCSInfo : YamlMF.CallSitesInfo) {
-    yaml::MachineInstrLoc MILoc = YamlCSInfo.CallLocation;
-    const MachineInstr *CallI;
-    if (parseMachineInst(MF, MILoc, CallI))
-      return true;
+    yaml::CallSiteInfo::MachineInstrLoc MILoc = YamlCSInfo.CallLocation;
+    if (MILoc.BlockNum >= MF.size())
+      return error(Twine(MF.getName()) +
+                   Twine(" call instruction block out of range.") +
+                   " Unable to reference bb:" + Twine(MILoc.BlockNum));
+    auto CallB = std::next(MF.begin(), MILoc.BlockNum);
+    if (MILoc.Offset >= CallB->size())
+      return error(Twine(MF.getName()) +
+                   Twine(" call instruction offset out of range.") +
+                   " Unable to reference instruction at bb: " +
+                   Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset));
+    auto CallI = std::next(CallB->instr_begin(), MILoc.Offset);
     if (!CallI->isCall(MachineInstr::IgnoreBundle))
       return error(Twine(MF.getName()) +
                    Twine(" call site info should reference call "
@@ -657,9 +641,6 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
     if (initializeCallSiteInfo(PFS, YamlMF))
       return true;
 
-  if (parseCalledGlobals(PFS, MF, YamlMF))
-    return true;
-
   setupDebugValueTracking(MF, PFS, YamlMF);
 
   MF.getSubtarget().mirFileLoaded(MF);
@@ -1130,37 +1111,6 @@ bool MIRParserImpl::parseMachineMetadataNodes(
   return false;
 }
 
-bool MIRParserImpl::parseCalledGlobals(PerFunctionMIParsingState &PFS,
-                                       MachineFunction &MF,
-                                       const yaml::MachineFunction &YMF) {
-  Function &F = MF.getFunction();
-  for (const auto &YamlCG : YMF.CalledGlobals) {
-    yaml::MachineInstrLoc MILoc = YamlCG.CallSite;
-    const MachineInstr *CallI;
-    if (parseMachineInst(MF, MILoc, CallI))
-      return true;
-    if (!CallI->isCall(MachineInstr::IgnoreBundle))
-      return error(Twine(MF.getName()) +
-                   Twine(" called global should reference call "
-                         "instruction. Instruction at bb:") +
-                   Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset) +
-                   " is not a call instruction");
-
-    auto Callee =
-        F.getParent()->getValueSymbolTable().lookup(YamlCG.Callee.Value);
-    if (!Callee)
-      return error(YamlCG.Callee.SourceRange.Start,
-                   "use of undefined global '" + YamlCG.Callee.Value + "'");
-    if (!isa<GlobalValue>(Callee))
-      return error(YamlCG.Callee.SourceRange.Start,
-                   "use of non-global value '" + YamlCG.Callee.Value + "'");
-
-    MF.addCalledGlobal(CallI, {cast<GlobalValue>(Callee), YamlCG.Flags});
-  }
-
-  return false;
-}
-
 SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error,
                                                  SMRange SourceRange) {
   assert(SourceRange.isValid() && "Invalid source range");
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index b6da495590fe1..c8f6341c1224d 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -133,9 +133,6 @@ class MIRPrinter {
   void convertMachineMetadataNodes(yaml::MachineFunction &YMF,
                                    const MachineFunction &MF,
                                    MachineModuleSlotTracker &MST);
-  void convertCalledGlobals(yaml::MachineFunction &YMF,
-                            const MachineFunction &MF,
-                            MachineModuleSlotTracker &MST);
 
 private:
   void initRegisterMaskIds(const MachineFunction &MF);
@@ -272,8 +269,6 @@ void MIRPrinter::print(const MachineFunction &MF) {
   // function.
   convertMachineMetadataNodes(YamlMF, MF, MST);
 
-  convertCalledGlobals(YamlMF, MF, MST);
-
   yaml::Output Out(OS);
   if (!SimplifyMIR)
     Out.setWriteDefaultValues(true);
@@ -560,7 +555,7 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF,
   const auto *TRI = MF.getSubtarget().getRegisterInfo();
   for (auto CSInfo : MF.getCallSitesInfo()) {
     yaml::CallSiteInfo YmlCS;
-    yaml::MachineInstrLoc CallLocation;
+    yaml::CallSiteInfo::MachineInstrLoc CallLocation;
 
     // Prepare instruction position.
     MachineBasicBlock::const_instr_iterator CallI =
         CSInfo.first->getIterator();
@@ -601,32 +596,6 @@ void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF,
   }
 }
 
-void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF,
-                                      const MachineFunction &MF,
-                                      MachineModuleSlotTracker &MST) {
-  for (const auto &[CallInst, CG] : MF.getCalledGlobals()) {
-    // If the call instruction was dropped, then we don't need to print it.
-    auto BB = CallInst->getParent();
-    if (BB) {
-      yaml::MachineInstrLoc CallSite;
-      CallSite.BlockNum = CallInst->getParent()->getNumber();
-      CallSite.Offset = std::distance(CallInst->getParent()->instr_begin(),
-                                      CallInst->getIterator());
-
-      yaml::CalledGlobal YamlCG{CallSite, CG.first->getName().str(), CG.second};
-      YMF.CalledGlobals.push_back(YamlCG);
-    }
-  }
-
-  // Sort by position of call instructions.
-  llvm::sort(YMF.CalledGlobals.begin(), YMF.CalledGlobals.end(),
-             [](yaml::CalledGlobal A, yaml::CalledGlobal B) {
-               if (A.CallSite.BlockNum == B.CallSite.BlockNum)
-                 return A.CallSite.Offset < B.CallSite.Offset;
-               return A.CallSite.BlockNum < B.CallSite.BlockNum;
-             });
-}
-
 void MIRPrinter::convert(yaml::MachineFunction &MF,
                          const MachineConstantPool &ConstantPool) {
   unsigned ID = 0;
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index c05aa1e40e477..f3f34f890be11 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveStacks.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -187,6 +188,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<ProfileSummaryInfoWrapperPass>();
   AU.addRequired<LiveStacksWrapperLegacy>();
   AU.addPreserved<LiveStacksWrapperLegacy>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequiredID(MachineDominatorsID);
   AU.addPreservedID(MachineDominatorsID);
   AU.addRequired<MachineLoopInfoWrapperPass>();
@@ -310,16 +312,20 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
                     << "********** Function: " << mf.getName() << '\n');
 
   MF = &mf;
+  auto &MBFI = getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
+  auto &LiveStks = getAnalysis<LiveStacksWrapperLegacy>().getLS();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+
   RegAllocBase::init(getAnalysis<VirtRegMapWrapperLegacy>().getVRM(),
                      getAnalysis<LiveIntervalsWrapperPass>().getLIS(),
                      getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM());
-  VirtRegAuxInfo VRAI(
-      *MF, *LIS, *VRM, getAnalysis<MachineLoopInfoWrapperPass>().getLI(),
-      getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI(),
-      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
+  VirtRegAuxInfo VRAI(*MF, *LIS, *VRM,
+                      getAnalysis<MachineLoopInfoWrapperPass>().getLI(), MBFI,
+                      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
   VRAI.calculateSpillWeightsAndHints();
 
-  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI));
+  SpillerInstance.reset(
+      createInlineSpiller({*LIS, LiveStks, MDT, MBFI}, *MF, *VRM, VRAI));
 
   allocatePhysRegs();
   postOptimization();
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index b94992c20b119..66e9cf546b837 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2750,6 +2750,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
   SpillPlacer = &getAnalysis<SpillPlacementWrapperLegacy>().getResult();
   DebugVars = &getAnalysis<LiveDebugVariablesWrapperLegacy>().getLDV();
+  auto &LSS = getAnalysis<LiveStacksWrapperLegacy>().getLS();
 
   initializeCSRCost();
 
@@ -2770,7 +2771,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
       getAnalysis<RegAllocPriorityAdvisorAnalysis>().getAdvisor(*MF, *this);
 
   VRAI = std::make_unique<VirtRegAuxInfo>(*MF, *LIS, *VRM, *Loops, *MBFI);
-  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, *VRAI));
+  SpillerInstance.reset(
+      createInlineSpiller({*LIS, LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI));
 
   VRAI->calculateSpillWeightsAndHints();
diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
index 696c312e4ba00..e230a1be95c9f 100644
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -794,6 +794,9 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
   MachineBlockFrequencyInfo &MBFI =
       getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
 
+  auto &LiveStks = getAnalysis<LiveStacksWrapperLegacy>().getLS();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+
   VirtRegMap &VRM = getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
 
   PBQPVirtRegAuxInfo VRAI(
@@ -807,7 +810,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
   VirtRegAuxInfo DefaultVRAI(
       MF, LIS, VRM, getAnalysis<MachineLoopInfoWrapperPass>().getLI(), MBFI);
   std::unique_ptr<Spiller> VRegSpiller(
-      createInlineSpiller(*this, MF, VRM, DefaultVRAI));
+      createInlineSpiller({LIS, LiveStks, MDT, MBFI}, MF, VRM, DefaultVRAI));
 
   MF.getRegInfo().freezeReservedRegs();
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index da3c834417d6b..02b79c67af3ee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16012,6 +16012,14 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
     if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
       return CombineLD;
 
+  // int_vt (bitcast (vec_vt (scalar_to_vector elt_vt:x)))
+  //   => int_vt (any_extend elt_vt:x)
+  if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isScalarInteger()) {
+    SDValue SrcScalar = N0.getOperand(0);
+    if (SrcScalar.getValueType().isScalarInteger())
+      return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, SrcScalar);
+  }
+
   // Remove double bitcasts from shuffles - this is often a legacy of
   // XformToShuffleWithZero being used to combine bitmaskings (of
   // float vectors bitcast to integer vectors) into shuffles.
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index bafe26ff7d6b7..dff7243b0a99c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -908,10 +908,6 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
         It->setMMRAMetadata(MF, MMRA);
     }
 
-    if (auto CalledGlobal = DAG->getCalledGlobal(Node))
-      if (CalledGlobal->first)
-        MF.addCalledGlobal(MI, *CalledGlobal);
-
     return MI;
   };
 
diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h
index 82c501dcafcb7..59cc489ade40d 100644
--- a/llvm/lib/IR/AttributeImpl.h
+++ b/llvm/lib/IR/AttributeImpl.h
@@ -346,6 +346,7 @@ class AttributeSetNode final
   UWTableKind getUWTableKind() const;
   AllocFnKind getAllocKind() const;
   MemoryEffects getMemoryEffects() const;
+  CaptureInfo getCaptureInfo() const;
   FPClassTest getNoFPClass() const;
   std::string getAsString(bool InAttrGrp) const;
   Type *getAttributeType(Attribute::AttrKind Kind) const;
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index e9daa01b899e8..ceb31856283c9 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -487,6 +487,12 @@ MemoryEffects Attribute::getMemoryEffects() const {
   return MemoryEffects::createFromIntValue(pImpl->getValueAsInt());
 }
 
+CaptureInfo Attribute::getCaptureInfo() const {
+  assert(hasAttribute(Attribute::Captures) &&
+         "Can only call getCaptureInfo() on captures attribute");
+  return CaptureInfo::createFromIntValue(pImpl->getValueAsInt());
+}
+
 FPClassTest Attribute::getNoFPClass() const {
   assert(hasAttribute(Attribute::NoFPClass) &&
          "Can only call getNoFPClass() on nofpclass attribute");
@@ -647,6 +653,13 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return Result;
   }
 
+  if (hasAttribute(Attribute::Captures)) {
+    std::string Result;
+    raw_string_ostream OS(Result);
+    OS << getCaptureInfo();
+    return Result;
+  }
+
   if (hasAttribute(Attribute::NoFPClass)) {
     std::string Result = "nofpclass";
     raw_string_ostream OS(Result);
@@ -1050,6 +1063,10 @@ AttributeSet::intersectWith(LLVMContext &C, AttributeSet Other) const {
         Intersected.addMemoryAttr(Attr0.getMemoryEffects() |
                                   Attr1.getMemoryEffects());
         break;
+      case Attribute::Captures:
+        Intersected.addCapturesAttr(Attr0.getCaptureInfo() |
+                                    Attr1.getCaptureInfo());
+        break;
       case Attribute::NoFPClass:
         Intersected.addNoFPClassAttr(Attr0.getNoFPClass() &
                                      Attr1.getNoFPClass());
@@ -1170,6 +1187,10 @@ MemoryEffects AttributeSet::getMemoryEffects() const {
   return SetNode ? SetNode->getMemoryEffects() : MemoryEffects::unknown();
 }
 
+CaptureInfo AttributeSet::getCaptureInfo() const {
+  return SetNode ? SetNode->getCaptureInfo() : CaptureInfo::all();
+}
+
 FPClassTest AttributeSet::getNoFPClass() const {
   return SetNode ? SetNode->getNoFPClass() : fcNone;
 }
@@ -1358,6 +1379,12 @@ MemoryEffects AttributeSetNode::getMemoryEffects() const {
   return MemoryEffects::unknown();
 }
 
+CaptureInfo AttributeSetNode::getCaptureInfo() const {
+  if (auto A = findEnumAttribute(Attribute::Captures))
+    return A->getCaptureInfo();
+  return CaptureInfo::all();
+}
+
 FPClassTest AttributeSetNode::getNoFPClass() const {
   if (auto A = findEnumAttribute(Attribute::NoFPClass))
     return A->getNoFPClass();
@@ -2190,6 +2217,10 @@ AttrBuilder &AttrBuilder::addMemoryAttr(MemoryEffects ME) {
   return addRawIntAttr(Attribute::Memory, ME.toIntValue());
 }
 
+AttrBuilder &AttrBuilder::addCapturesAttr(CaptureInfo CI) {
+  return addRawIntAttr(Attribute::Captures, CI.toIntValue());
+}
+
 AttrBuilder &AttrBuilder::addNoFPClassAttr(FPClassTest Mask) {
   if (Mask == fcNone)
     return *this;
@@ -2350,7 +2381,8 @@ AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, AttributeSet AS,
         .addAttribute(Attribute::DereferenceableOrNull)
         .addAttribute(Attribute::Writable)
         .addAttribute(Attribute::DeadOnUnwind)
-        .addAttribute(Attribute::Initializes);
+        .addAttribute(Attribute::Initializes)
+        .addAttribute(Attribute::Captures);
   if (ASK & ASK_UNSAFE_TO_DROP)
     Incompatible.addAttribute(Attribute::Nest)
         .addAttribute(Attribute::SwiftError)
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 49c148bb68a4d..b8b2c1d7f9a85 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3886,8 +3886,7 @@ bool CmpInst::isFalseWhenEqual(Predicate predicate) {
   }
 }
 
-bool ICmpInst::isImpliedTrueByMatchingCmp(CmpPredicate Pred1,
-                                          CmpPredicate Pred2) {
+static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, CmpPredicate Pred2) {
   // If the predicates match, then we know the first condition implies the
   // second is true.
   if (CmpPredicate::getMatching(Pred1, Pred2))
@@ -3901,25 +3900,35 @@ bool ICmpInst::isImpliedTrueByMatchingCmp(CmpPredicate Pred1,
   switch (Pred1) {
   default:
     break;
-  case ICMP_EQ:
+  case CmpInst::ICMP_EQ:
     // A == B implies A >=u B, A <=u B, A >=s B, and A <=s B are true.
-    return Pred2 == ICMP_UGE || Pred2 == ICMP_ULE || Pred2 == ICMP_SGE ||
-           Pred2 == ICMP_SLE;
-  case ICMP_UGT: // A >u B implies A != B and A >=u B are true.
-    return Pred2 == ICMP_NE || Pred2 == ICMP_UGE;
-  case ICMP_ULT: // A <u B implies A != B and A <=u B are true.
-    return Pred2 == ICMP_NE || Pred2 == ICMP_ULE;
-  case ICMP_SGT: // A >s B implies A != B and A >=s B are true.
-    return Pred2 == ICMP_NE || Pred2 == ICMP_SGE;
-  case ICMP_SLT: // A <s B implies A != B and A <=s B are true.
-    return Pred2 == ICMP_NE || Pred2 == ICMP_SLE;
+  case CmpInst::ICMP_EQ:
+    // A == B implies A >=u B, A <=u B, A >=s B, and A <=s B are true.
+    return Pred2 == CmpInst::ICMP_UGE || Pred2 == CmpInst::ICMP_ULE ||
+           Pred2 == CmpInst::ICMP_SGE || Pred2 == CmpInst::ICMP_SLE;
+  case CmpInst::ICMP_UGT: // A >u B implies A != B and A >=u B are true.
+    return Pred2 == CmpInst::ICMP_NE || Pred2 == CmpInst::ICMP_UGE;
+  case CmpInst::ICMP_ULT: // A <u B implies A != B and A <=u B are true.
+    return Pred2 == CmpInst::ICMP_NE || Pred2 == CmpInst::ICMP_ULE;
+  case CmpInst::ICMP_SGT: // A >s B implies A != B and A >=s B are true.
+    return Pred2 == CmpInst::ICMP_NE || Pred2 == CmpInst::ICMP_SGE;
+  case CmpInst::ICMP_SLT: // A <s B implies A != B and A <=s B are true.
+    return Pred2 == CmpInst::ICMP_NE || Pred2 == CmpInst::ICMP_SLE;
   }
   return false;
 }
 
-bool ICmpInst::isImpliedFalseByMatchingCmp(CmpPredicate Pred1,
-                                           CmpPredicate Pred2) {
+static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1,
+                                        CmpPredicate Pred2) {
   return isImpliedTrueByMatchingCmp(Pred1,
                                     ICmpInst::getInverseCmpPredicate(Pred2));
 }
+
+std::optional<bool> ICmpInst::isImpliedByMatchingCmp(CmpPredicate Pred1,
+                                                     CmpPredicate Pred2) {
+  if (isImpliedTrueByMatchingCmp(Pred1, Pred2))
+    return true;
+  if (isImpliedFalseByMatchingCmp(Pred1, Pred2))
+    return false;
+  return std::nullopt;
+}
 
 //===----------------------------------------------------------------------===//
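For illustration, a standalone mirror (not LLVM code) of the duality the helpers above rely on: "P1 implies P2 is false" is exactly "P1 implies inverse(P2) is true", so only the implies-true table has to be spelled out.

```cpp
#include <cassert>

enum Pred { EQ, NE };

Pred inverse(Pred P) { return P == EQ ? NE : EQ; }

bool isImpliedTrue(Pred P1, Pred P2) { return P1 == P2; }

// Derived entirely from the implies-true table via predicate inversion.
bool isImpliedFalse(Pred P1, Pred P2) {
  return isImpliedTrue(P1, inverse(P2));
}

int main() {
  assert(isImpliedFalse(EQ, NE)); // X == Y implies (X != Y) is false.
  assert(isImpliedFalse(NE, EQ)); // X != Y implies (X == Y) is false.
}
```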
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index dd8058c6d5cd8..01fe11ed20501 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -209,8 +209,6 @@ class MCAsmStreamer final : public MCStreamer {
   void emitCOFFSectionIndex(MCSymbol const *Symbol) override;
   void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override;
   void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override;
-  void emitCOFFSecNumber(MCSymbol const *Symbol) override;
-  void emitCOFFSecOffset(MCSymbol const *Symbol) override;
   void emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size,
                                   MCSymbol *CsectSym, Align Alignment) override;
   void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol,
@@ -895,18 +893,6 @@ void MCAsmStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {
   EmitEOL();
 }
 
-void MCAsmStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
-  OS << "\t.secnum\t";
-  Symbol->print(OS, MAI);
-  EmitEOL();
-}
-
-void MCAsmStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
-  OS << "\t.secoffset\t";
-  Symbol->print(OS, MAI);
-  EmitEOL();
-}
-
 // We need an XCOFF-specific version of this directive as the AIX syntax
 // requires a QualName argument identifying the csect name and storage mapping
 // class to appear before the alignment if we are specifying it.
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 150e38a94db6a..f37e138edc36b 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -596,11 +596,6 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
                             COFF::IMAGE_SCN_MEM_READ);
   }
 
-  if (T.getArch() == Triple::aarch64) {
-    ImportCallSection =
-        Ctx->getCOFFSection(".impcall", COFF::IMAGE_SCN_LNK_INFO);
-  }
-
   // Debug info.
   COFFDebugSymbolsSection =
       Ctx->getCOFFSection(".debug$S", (COFF::IMAGE_SCN_MEM_DISCARDABLE |
diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp
index dd5ce9964a194..4d95a72085283 100644
--- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp
@@ -70,8 +70,6 @@ class COFFAsmParser : public MCAsmParserExtension {
     addDirectiveHandler<&COFFAsmParser::parseDirectiveSymbolAttribute>(
         ".weak_anti_dep");
     addDirectiveHandler<&COFFAsmParser::parseDirectiveCGProfile>(".cg_profile");
-    addDirectiveHandler<&COFFAsmParser::parseDirectiveSecNum>(".secnum");
-    addDirectiveHandler<&COFFAsmParser::parseDirectiveSecOffset>(".secoffset");
 
     // Win64 EH directives.
     addDirectiveHandler<&COFFAsmParser::parseSEHDirectiveStartProc>(
@@ -128,8 +126,6 @@ class COFFAsmParser : public MCAsmParserExtension {
   bool parseDirectiveLinkOnce(StringRef, SMLoc);
   bool parseDirectiveRVA(StringRef, SMLoc);
   bool parseDirectiveCGProfile(StringRef, SMLoc);
-  bool parseDirectiveSecNum(StringRef, SMLoc);
-  bool parseDirectiveSecOffset(StringRef, SMLoc);
 
   // Win64 EH directives.
   bool parseSEHDirectiveStartProc(StringRef, SMLoc);
@@ -581,36 +577,6 @@ bool COFFAsmParser::parseDirectiveSymIdx(StringRef, SMLoc) {
   return false;
 }
 
-bool COFFAsmParser::parseDirectiveSecNum(StringRef, SMLoc) {
-  StringRef SymbolID;
-  if (getParser().parseIdentifier(SymbolID))
-    return TokError("expected identifier in directive");
-
-  if (getLexer().isNot(AsmToken::EndOfStatement))
-    return TokError("unexpected token in directive");
-
-  MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
-
-  Lex();
-  getStreamer().emitCOFFSecNumber(Symbol);
-  return false;
-}
-
-bool COFFAsmParser::parseDirectiveSecOffset(StringRef, SMLoc) {
-  StringRef SymbolID;
-  if (getParser().parseIdentifier(SymbolID))
-    return TokError("expected identifier in directive");
-
-  if (getLexer().isNot(AsmToken::EndOfStatement))
-    return TokError("unexpected token in directive");
-
-  MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
-
-  Lex();
-  getStreamer().emitCOFFSecOffset(Symbol);
-  return false;
-}
-
 /// ::= [ identifier ]
 bool COFFAsmParser::parseCOMDATType(COFF::COMDATType &Type) {
   StringRef TypeId = getTok().getIdentifier();
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index e690723c0e502..ccf65df150e78 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -1023,10 +1023,6 @@ void MCStreamer::emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {}
 
 void MCStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {}
 
-void MCStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {}
-
-void MCStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {}
-
 /// EmitRawText - If this file is backed by an assembly streamer, this dumps
 /// the specified string in the output .s file. This capability is
 /// indicated by the hasRawTextSupport() predicate.
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 8fd46bc8b0255..395d4db3103d7 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -29,7 +29,6 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSymbolCOFF.h"
 #include "llvm/MC/MCTargetOptions.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -44,91 +43,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "WinCOFFStreamer"
 
-/// MCExpr that represents the physical number for the sections that contains
-/// a symbol.
-class MCCOFFSectionNumberTargetExpr final : public MCTargetExpr {
-  const MCSymbol &SectionSymbol;
-  const WinCOFFObjectWriter &Writer;
-
-  MCCOFFSectionNumberTargetExpr(const MCSymbol &SectionSymbol_,
-                                const WinCOFFObjectWriter &Writer_)
-      : SectionSymbol(SectionSymbol_), Writer(Writer_) {}
-
-public:
-  static MCCOFFSectionNumberTargetExpr *
-  create(const MCSymbol &SectionSymbol, const WinCOFFObjectWriter &Writer,
-         MCContext &Ctx) {
-    return new (Ctx) MCCOFFSectionNumberTargetExpr(SectionSymbol, Writer);
-  }
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override {
-    OS << ":secnum:";
-    SectionSymbol.print(OS, MAI);
-  }
-
-  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
-                                 const MCFixup *Fixup) const override {
-    auto sectionNumber = Writer.getSectionNumber(SectionSymbol.getSection());
-    assert(sectionNumber != 0 &&
-           "Containing section was not assigned a number");
-    Res = MCValue::get(sectionNumber);
-    return true;
-  }
-
-  void visitUsedExpr(MCStreamer &Streamer) const override {
-    // Contains no sub-expressions.
-  }
-
-  MCFragment *findAssociatedFragment() const override {
-    return SectionSymbol.getFragment();
-  }
-
-  void fixELFSymbolsInTLSFixups(MCAssembler &) const override {
-    llvm_unreachable("Not supported for ELF");
-  }
-};
-
-/// MCExpr that represents the offset to a symbol from the beginning of its
-/// section.
-class MCCOFFSectionOffsetTargetExpr final : public MCTargetExpr {
-  const MCSymbol &Symbol;
-
-  MCCOFFSectionOffsetTargetExpr(const MCSymbol &Symbol_) : Symbol(Symbol_) {}
-
-public:
-  static MCCOFFSectionOffsetTargetExpr *create(const MCSymbol &Symbol,
-                                               MCContext &Ctx) {
-    return new (Ctx) MCCOFFSectionOffsetTargetExpr(Symbol);
-  }
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override {
-    OS << ":secoffset:";
-    Symbol.print(OS, MAI);
-  }
-
-  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
-                                 const MCFixup *Fixup) const override {
-    uint64_t CallsiteOffset = 0;
-    if (!Asm->getSymbolOffset(Symbol, CallsiteOffset)) {
-      return true;
-    }
-    Res = MCValue::get(CallsiteOffset);
-    return true;
-  }
-
-  void visitUsedExpr(MCStreamer &Streamer) const override {
-    // Contains no sub-expressions.
-  }
-
-  MCFragment *findAssociatedFragment() const override {
-    return Symbol.getFragment();
-  }
-
-  void fixELFSymbolsInTLSFixups(MCAssembler &) const override {
-    llvm_unreachable("Not supported for ELF");
-  }
-};
-
 MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> MAB,
                                      std::unique_ptr<MCCodeEmitter> CE,
@@ -366,34 +280,6 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
   DF->appendContents(4, 0);
 }
 
-void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
-  visitUsedSymbol(*Symbol);
-  MCDataFragment *DF = getOrCreateDataFragment();
-  // Create Symbol for section number.
-  const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
-      *Symbol, this->getWriter(), getContext());
-  // Build the relocation.
-  MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
-  // Record the relocation.
-  DF->getFixups().push_back(Fixup);
-  // Emit 4 bytes (zeros) to the object file.
-  DF->appendContents(4, 0);
-}
-
-void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
-  visitUsedSymbol(*Symbol);
-  MCDataFragment *DF = getOrCreateDataFragment();
-  // Create Symbol for section offset.
-  const MCExpr *MCE =
-      MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
-  // Build the relocation.
-  MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
-  // Record the relocation.
-  DF->getFixups().push_back(Fixup);
-  // Emit 4 bytes (zeros) to the object file.
-  DF->appendContents(4, 0);
-}
-
 void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
                                          Align ByteAlignment) {
   auto *Symbol = cast<MCSymbolCOFF>(S);
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 39e02d0522bcf..09d2b08e43050 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -163,7 +163,6 @@ class llvm::WinCOFFWriter {
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue);
   uint64_t writeObject(MCAssembler &Asm);
-  int getSectionNumber(const MCSection &Section) const;
 
 private:
   COFFSymbol *createSymbol(StringRef Name);
@@ -819,15 +818,6 @@ void WinCOFFWriter::executePostLayoutBinding(MCAssembler &Asm) {
     if (!Symbol.isTemporary() ||
         cast<MCSymbolCOFF>(Symbol).getClass() == COFF::IMAGE_SYM_CLASS_STATIC)
       defineSymbol(Asm, Symbol);
-
-  UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
-  Header.NumberOfSections = Sections.size();
-  Header.NumberOfSymbols = 0;
-  if (Sections.size() > INT32_MAX)
-    report_fatal_error(
-        "PE COFF object files can't have more than 2147483647 sections");
-
-  assignSectionNumbers();
 }
 
 void WinCOFFWriter::recordRelocation(MCAssembler &Asm,
@@ -990,7 +980,16 @@ static std::time_t getTime() {
 uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
   uint64_t StartOffset = W.OS.tell();
 
+  if (Sections.size() > INT32_MAX)
+    report_fatal_error(
+        "PE COFF object files can't have more than 2147483647 sections");
+
+  UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
+  Header.NumberOfSections = Sections.size();
+  Header.NumberOfSymbols = 0;
+
   setWeakDefaultNames();
+  assignSectionNumbers();
   if (Mode != DwoOnly)
     createFileSymbols(Asm);
@@ -1144,10 +1143,6 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
   return W.OS.tell() - StartOffset;
 }
 
-int WinCOFFWriter::getSectionNumber(const MCSection &Section) const {
-  return SectionMap.at(&Section)->Number;
-}
-
 //------------------------------------------------------------------------------
 // WinCOFFObjectWriter class implementation
 
@@ -1199,10 +1194,6 @@ uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm) {
   return TotalSize;
 }
 
-int WinCOFFObjectWriter::getSectionNumber(const MCSection &Section) const {
-  return ObjWriter->getSectionNumber(Section);
-}
-
 MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_)
     : Machine(Machine_) {}
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index aac4407740055..f923d5aabe0a0 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1315,10 +1315,18 @@ parseBoundsCheckingOptions(StringRef Params) {
     } else if (ParamName == "merge") {
      Options.Merge = true;
     } else {
-      return make_error<StringError>(
-          formatv("invalid BoundsChecking pass parameter '{0}' ", ParamName)
-              .str(),
-          inconvertibleErrorCode());
+      StringRef ParamEQ;
+      StringRef Val;
+      std::tie(ParamEQ, Val) = ParamName.split('=');
+      int8_t Id = 0;
+      if (ParamEQ == "guard" && !Val.getAsInteger(0, Id)) {
+        Options.GuardKind = Id;
+      } else {
+        return make_error<StringError>(
+            formatv("invalid BoundsChecking pass parameter '{0}' ", ParamName)
+                .str(),
+            inconvertibleErrorCode());
+      }
     }
   }
   return Options;
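For illustration, a standalone mirror (plain C++, not the LLVM `StringRef` API) of the `guard=<kind>` parameter handling added above: split on `=`, check the key, and parse the value as an integer, rejecting malformed input.

```cpp
#include <cassert>
#include <charconv>
#include <cstdint>
#include <optional>
#include <string_view>

std::optional<int8_t> parseGuardParam(std::string_view Param) {
  size_t Eq = Param.find('=');
  if (Eq == std::string_view::npos || Param.substr(0, Eq) != "guard")
    return std::nullopt; // Not a guard=... parameter.
  std::string_view Val = Param.substr(Eq + 1);
  int8_t Id = 0;
  auto [Ptr, Ec] = std::from_chars(Val.data(), Val.data() + Val.size(), Id);
  if (Ec != std::errc() || Ptr != Val.data() + Val.size())
    return std::nullopt; // Value is not a well-formed integer.
  return Id;
}

int main() {
  assert(parseGuardParam("guard=3") == int8_t(3));
  assert(!parseGuardParam("guard=").has_value());
  assert(!parseGuardParam("merge").has_value());
}
```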
diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp
index a4eb70edd38d1..d3b3dd11171f1 100644
--- a/llvm/lib/Support/ModRef.cpp
+++ b/llvm/lib/Support/ModRef.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/Support/ModRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 
 using namespace llvm;
 
@@ -50,3 +51,36 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) {
   });
   return OS;
 }
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, CaptureComponents CC) {
+  if (capturesNothing(CC)) {
+    OS << "none";
+    return OS;
+  }
+
+  ListSeparator LS;
+  if (capturesAddressIsNullOnly(CC))
+    OS << LS << "address_is_null";
+  else if (capturesAddress(CC))
+    OS << LS << "address";
+  if (capturesReadProvenanceOnly(CC))
+    OS << LS << "read_provenance";
+  if (capturesFullProvenance(CC))
+    OS << LS << "provenance";
+
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, CaptureInfo CI) {
+  ListSeparator LS;
+  CaptureComponents Other = CI.getOtherComponents();
+  CaptureComponents Ret = CI.getRetComponents();
+
+  OS << "captures(";
+  if (!capturesNothing(Other) || Other == Ret)
+    OS << LS << Other;
+  if (Other != Ret)
+    OS << LS << "ret: " << Ret;
+  OS << ")";
+  return OS;
+}
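A quick usage sketch of the printers above, assuming the headers from this patch; the expected strings in the comments follow from tracing the branches and the LangRef syntax, so treat them as illustrative rather than authoritative.

```cpp
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // Other == Ret == None: the "none" branch fires once.
  errs() << CaptureInfo(CaptureComponents::None) << "\n";
  // -> captures(none)

  // Distinct other/ret components get a "ret:" prefixed second list.
  errs() << CaptureInfo(CaptureComponents::AddressIsNull,
                        CaptureComponents::All)
         << "\n";
  // -> captures(address_is_null, ret: address, provenance)

  // Identical components are printed only once, without "ret:".
  errs() << CaptureInfo(CaptureComponents::All) << "\n";
  // -> captures(address, provenance)
}
```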
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index e23aec6efba59..983242ade0fe5 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -235,8 +235,7 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
     return tgtok::dot;
 
   case '\r':
-    PrintFatalError("getNextChar() must never return '\r'");
-    return tgtok::Error;
+    llvm_unreachable("getNextChar() must never return '\r'");
 
   case ' ':
   case '\t':
@@ -664,11 +663,10 @@ bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
   PrepIncludeStack.pop_back();
 
   if (IncludeStackMustBeEmpty) {
-    if (!PrepIncludeStack.empty())
-      PrintFatalError("preprocessor include stack is not empty");
+    assert(PrepIncludeStack.empty() &&
+           "preprocessor include stack is not empty");
   } else {
-    if (PrepIncludeStack.empty())
-      PrintFatalError("preprocessor include stack is empty");
+    assert(!PrepIncludeStack.empty() && "preprocessor include stack is empty");
   }
 
   return true;
@@ -718,27 +716,25 @@ tgtok::TokKind TGLexer::prepIsDirective() const {
   return tgtok::Error;
 }
 
-bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
+void TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
   TokStart = CurPtr;
 
-  for (const auto [PKind, PWord] : PreprocessorDirs)
+  for (const auto [PKind, PWord] : PreprocessorDirs) {
     if (PKind == Kind) {
       // Advance CurPtr to the end of the preprocessing word.
       CurPtr += PWord.size();
-      return true;
+      return;
     }
+  }
 
-  PrintFatalError("unsupported preprocessing token in "
-                  "prepEatPreprocessorDirective()");
-  return false;
+  llvm_unreachable(
+      "unsupported preprocessing token in prepEatPreprocessorDirective()");
 }
 
 tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
                                         bool ReturnNextLiveToken) {
   // We must be looking at a preprocessing directive. Eat it!
-  if (!prepEatPreprocessorDirective(Kind))
-    PrintFatalError("lexPreprocessor() called for unknown "
-                    "preprocessor directive");
+  prepEatPreprocessorDirective(Kind);
 
   if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
     StringRef MacroName = prepLexMacroName();
@@ -818,13 +814,11 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
     if (PrepIncludeStack.back().empty())
       return ReturnError(TokStart, "#endif without #ifdef");
 
-    auto &IfdefOrElseEntry = PrepIncludeStack.back().back();
+    [[maybe_unused]] auto &IfdefOrElseEntry = PrepIncludeStack.back().back();
 
-    if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
-        IfdefOrElseEntry.Kind != tgtok::Else) {
-      PrintFatalError("invalid preprocessor control on the stack");
-      return tgtok::Error;
-    }
+    assert((IfdefOrElseEntry.Kind == tgtok::Ifdef ||
+            IfdefOrElseEntry.Kind == tgtok::Else) &&
+           "invalid preprocessor control on the stack");
 
     if (!prepSkipDirectiveEnd())
       return ReturnError(CurPtr, "only comments are supported after #endif");
@@ -852,21 +846,17 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
       return ReturnError(CurPtr,
                          "only comments are supported after #define NAME");
 
-    if (!ReturnNextLiveToken) {
-      PrintFatalError("#define must be ignored during the lines skipping");
-      return tgtok::Error;
-    }
+    assert(ReturnNextLiveToken &&
+           "#define must be ignored during the lines skipping");
 
     return LexToken();
   }
 
-  PrintFatalError("preprocessing directive is not supported");
-  return tgtok::Error;
+  llvm_unreachable("preprocessing directive is not supported");
 }
 
 bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
-  if (!MustNeverBeFalse)
-    PrintFatalError("invalid recursion.");
+  assert(MustNeverBeFalse && "invalid recursion.");
 
   do {
     // Skip all symbols to the line end.
@@ -902,20 +892,17 @@ bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
     if (ProcessedKind == tgtok::Error)
       return false;
 
-    if (Kind != ProcessedKind)
-      PrintFatalError("prepIsDirective() and lexPreprocessor() "
-                      "returned different token kinds");
+    assert(Kind == ProcessedKind && "prepIsDirective() and lexPreprocessor() "
                                    "returned different token kinds");
 
     // If this preprocessing directive enables tokens processing,
     // then return to the lexPreprocessor() and get to the next token.
     // We can move from line-skipping mode to processing tokens only
     // due to #else or #endif.
     if (prepIsProcessingEnabled()) {
-      if (Kind != tgtok::Else && Kind != tgtok::Endif) {
-        PrintFatalError("tokens processing was enabled by an unexpected "
-                        "preprocessing directive");
-        return false;
-      }
+      assert((Kind == tgtok::Else || Kind == tgtok::Endif) &&
+             "tokens processing was enabled by an unexpected preprocessing "
+             "directive");
 
       return true;
     }
@@ -1053,10 +1040,6 @@ bool TGLexer::prepIsProcessingEnabled() {
 }
 
 void TGLexer::prepReportPreprocessorStackError() {
-  if (PrepIncludeStack.back().empty())
-    PrintFatalError("prepReportPreprocessorStackError() called with "
-                    "empty control stack");
-
   auto &PrepControl = PrepIncludeStack.back().back();
   PrintError(CurBuf.end(), "reached EOF without matching #endif");
   PrintError(PrepControl.SrcPos, "the latest preprocessor control is here");
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index f8b32dc5377f5..bac583c4e33a1 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -347,14 +347,13 @@ class TGLexer {
   tgtok::TokKind prepIsDirective() const;
 
   // Given a preprocessing token kind, adjusts CurPtr to the end
-  // of the preprocessing directive word. Returns true, unless
-  // an unsupported token kind is passed in.
+  // of the preprocessing directive word.
   //
   // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
   // to avoid adjusting CurPtr before we are sure that '#' is followed
   // by a preprocessing directive. If it is not, then we fall back to
   // tgtok::paste interpretation of '#'.
-  bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
+  void prepEatPreprocessorDirective(tgtok::TokKind Kind);
 
   // The main "exit" point from the token parsing to preprocessor.
   //
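For illustration, a standalone sketch of the idiom this TableGen change applies: conditions that can only indicate internal lexer bugs (not bad user input) are downgraded from runtime fatal errors to `assert`/`llvm_unreachable`, which compile away in release builds. `MY_UNREACHABLE` stands in for `llvm_unreachable` here.

```cpp
#include <cassert>
#include <cstdlib>

#define MY_UNREACHABLE(msg) (assert(false && msg), std::abort())

enum Tok { Ifdef, Else, Endif };

void popControl(Tok Top) {
  // Invariant maintained by the lexer itself, so assert instead of
  // reporting a user-facing error.
  assert((Top == Ifdef || Top == Else) &&
         "invalid preprocessor control on the stack");
  (void)Top;
}

int handle(Tok K) {
  switch (K) {
  case Ifdef: return 0;
  case Else:  return 1;
  case Endif: return 2;
  }
  MY_UNREACHABLE("preprocessing directive is not supported");
}
```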
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index e3dd334e7b098..20e77b3be2a27 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -63,7 +63,7 @@ def SVE2p1Unsupported : AArch64Unsupported;
 
 def SVE2Unsupported : AArch64Unsupported {
   let F = !listconcat([HasSVE2, HasSVE2orSME, HasSVE2orSME2, HasSSVE_FP8FMA, HasSMEF8F16,
-                       HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm,
+                       HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVEBitPerm,
                        HasSVEB16B16],
                       SVE2p1Unsupported.F);
 }
@@ -74,7 +74,7 @@ def SVEUnsupported : AArch64Unsupported {
 }
 
 let F = [HasSME2p2, HasSVE2p2orSME2p2, HasNonStreamingSVEorSME2p2,
-         HasNonStreamingSVE2p2orSME2p2] in
+         HasNonStreamingSVE2p2orSME2p2, HasNonStreamingSVE2orSSVE_BitPerm] in
 def SME2p2Unsupported : AArch64Unsupported;
 
 def SME2p1Unsupported : AArch64Unsupported {
(SME) F8F16 instructions", [FeatureSME2, FeatureFP8]>; def FeatureCPA : ExtensionWithMArch<"cpa", "CPA", "FEAT_CPA", "Enable Armv9.5-A Checked Pointer Arithmetic">; @@ -565,6 +567,9 @@ def FeaturePCDPHINT: ExtensionWithMArch<"pcdphint", "PCDPHINT", "FEAT_PCDPHINT", def FeaturePoPS: ExtensionWithMArch<"pops", "PoPS", "FEAT_PoPS", "Enable Armv9.6-A Point Of Physical Storage (PoPS) DC instructions">; +def FeatureSSVE_BitPerm : ExtensionWithMArch<"ssve-bitperm", "SSVE_BitPerm", "FEAT_SSVE_BitPerm", + "Enable Armv9.6-A SVE BitPerm support in streaming SVE mode", [FeatureSME2, FeatureSVEBitPerm]>; + //===----------------------------------------------------------------------===// // Other Features //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d9877fef1437c..7e82a433a85ad 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9450,14 +9450,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. - const GlobalValue *CalledGlobal = nullptr; - unsigned OpFlags = 0; if (auto *G = dyn_cast(Callee)) { - CalledGlobal = G->getGlobal(); - OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal, - getTargetMachine()); + auto GV = G->getGlobal(); + unsigned OpFlags = + Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); if (OpFlags & AArch64II::MO_GOT) { - Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else { const GlobalValue *GV = G->getGlobal(); @@ -9577,8 +9575,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); - if (CalledGlobal) - DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags); return Ret; } @@ -9590,8 +9586,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InGlue = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); - if (CalledGlobal) - DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? 
alignTo(NumBytes, 16) : 0; @@ -9664,7 +9658,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getConstant(0, DL, MVT::i64)); TPIDR2.Uses++; } else if (RequiresSaveAllZA) { - Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain, + Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result, /*IsSave=*/false); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c6f5cdcd1d5fe..948701f897855 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -155,8 +155,8 @@ def HasSVE2SM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SM4()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">; -def HasSVE2BitPerm : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2BitPerm()">, - AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; +def HasSVEBitPerm : Predicate<"Subtarget->hasSVEBitPerm()">, + AssemblerPredicateWithAll<(all_of FeatureSVEBitPerm), "sve-bitperm">; def HasSMEandIsNonStreamingSafe : Predicate<"Subtarget->hasSME()">, AssemblerPredicateWithAll<(all_of FeatureSME), "sme">; @@ -286,6 +286,10 @@ def HasNonStreamingSVE2p2orSME2p2 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2), "sme2p2 or sve2p2">; +def HasNonStreamingSVE2orSSVE_BitPerm + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_BitPerm())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm), "sve2 or ssve-bitperm">; // A subset of NEON instructions are legal in Streaming SVE execution mode, // so don't need the additional check for 'isNeonAvailable'. 
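With the split above, the BEXT/BDEP/BGRP instructions are now gated on sve-bitperm, and the new HasNonStreamingSVE2orSSVE_BitPerm predicate admits them either with SVE2 outside streaming mode or with ssve-bitperm in streaming mode. Existing user code should be unaffected as long as it uses the usual ACLE names; the sketch below assumes the standard ACLE intrinsic and macro spellings (svbext_u32, __ARM_FEATURE_SVE2_BITPERM), which are not identifiers introduced by this patch.

    #include <arm_sve.h>

    #if defined(__ARM_FEATURE_SVE2_BITPERM)
    // BEXT: gather the bits of Data selected by Mask into the low-order bits.
    svuint32_t extract_masked_bits(svuint32_t Data, svuint32_t Mask) {
      return svbext_u32(Data, Mask);
    }
    #endif

Compiling such code with -march=...+sve2-bitperm should behave as before, since the alias expands to +sve2+sve-bitperm.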
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 2da67126a1753..364ab0d82bf88 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -694,7 +694,7 @@ def ProcessorFeatures { FeatureLSE, FeatureRAS, FeatureRDM]; list<SubtargetFeature> A510 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, @@ -702,7 +702,7 @@ def ProcessorFeatures { FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -711,7 +711,7 @@ def ProcessorFeatures { FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureDotProd]; list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -747,14 +747,14 @@ def ProcessorFeatures { list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureCCIDX, FeatureSSBS, FeatureETE, FeatureMTE, FeatureFP16FML, - FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8, + FeatureSVEBitPerm, FeatureBF16, FeatureMatMulInt8, FeaturePAuth, FeatureFlagM, FeatureSB, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; list<SubtargetFeature> A715 = [HasV9_0aOps, FeatureNEON, FeatureMTE, FeatureCCIDX, FeatureFP16FML, FeatureSVE, FeatureTRBE, - FeatureSVE2BitPerm, FeatureBF16, FeatureETE, + FeatureSVEBitPerm, FeatureBF16, FeatureETE, FeaturePerfMon, FeatureMatMulInt8, FeatureSPE, FeatureSB, FeatureSSBS, FeatureFullFP16, FeaturePAuth, FeaturePredRes, FeatureFlagM, FeatureSVE2, FeatureComplxNum, FeatureCRC, @@ -763,7 +763,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureRDM]; list<SubtargetFeature> A720 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, - FeatureTRBE, FeatureSVE2BitPerm, FeatureETE, + FeatureTRBE, FeatureSVEBitPerm, FeatureETE, FeaturePerfMon, FeatureSPE, FeatureSPE_EEF, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, @@ -772,7 +772,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureRDM]; list<SubtargetFeature> A720AE = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, - FeatureTRBE, FeatureSVE2BitPerm, FeatureETE, + FeatureTRBE, FeatureSVEBitPerm, FeatureETE, FeaturePerfMon, FeatureSPE, FeatureSPE_EEF, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, @@ -782,7 +782,7 @@ def ProcessorFeatures { list<SubtargetFeature> A725 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, FeatureETE, FeaturePerfMon, FeatureSPE, - FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE, + FeatureSVEBitPerm, FeatureSPE_EEF, FeatureTRBE, FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, @@ -814,7 +814,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureCRC, 
FeatureLSE, FeatureRAS, FeatureRDM]; list<SubtargetFeature> X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeaturePAuth, FeatureSSBS, FeatureSB, FeatureSVE, FeatureSVE2, FeatureFlagM, @@ -823,7 +823,7 @@ list<SubtargetFeature> X3 = [HasV9_0aOps, FeatureSVE, FeatureNEON, FeaturePerfMon, FeatureETE, FeatureTRBE, FeatureSPE, FeatureBF16, FeatureMatMulInt8, - FeatureMTE, FeatureSVE2BitPerm, FeatureFullFP16, + FeatureMTE, FeatureSVEBitPerm, FeatureFullFP16, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeaturePAuth, FeaturePredRes, FeatureFlagM, FeatureSSBS, @@ -831,7 +831,7 @@ FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureDotProd]; list<SubtargetFeature> X4 = [HasV9_2aOps, FeaturePerfMon, FeatureETE, FeatureTRBE, - FeatureSPE, FeatureMTE, FeatureSVE2BitPerm, + FeatureSPE, FeatureMTE, FeatureSVEBitPerm, FeatureFP16FML, FeatureSPE_EEF, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -841,7 +841,7 @@ list<SubtargetFeature> X925 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, FeatureETE, FeaturePerfMon, FeatureSPE, - FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE, + FeatureSVEBitPerm, FeatureSPE_EEF, FeatureTRBE, FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, @@ -855,7 +855,7 @@ FeatureFPAC, FeatureFP16FML, FeatureRandGen, FeatureSSBS, FeatureLS64, FeatureCLRBHB, FeatureSPECRES2, FeatureSVEAES, FeatureSVE2SM4, - FeatureSVE2SHA3, FeatureSVE2BitPerm, FeatureETE, + FeatureSVE2SHA3, FeatureSVE2, FeatureSVEBitPerm, FeatureETE, FeatureMEC, FeatureFP8DOT2]; list<SubtargetFeature> Carmel = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES, FeatureFullFP16, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM, @@ -942,7 +942,7 @@ FeaturePerfMon, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM]; list<SubtargetFeature> NeoverseN2 = [HasV9_0aOps, FeatureBF16, FeatureETE, FeatureFP16FML, FeatureMatMulInt8, FeatureMTE, FeatureSVE2, - FeatureSVE2BitPerm, FeatureTRBE, + FeatureSVEBitPerm, FeatureTRBE, FeaturePerfMon, FeatureCCIDX, FeatureDotProd, FeatureFullFP16, FeatureSB, FeatureSSBS, FeatureSVE, @@ -951,7 +951,7 @@ list<SubtargetFeature> NeoverseN3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, FeatureSPE_EEF, - FeatureSVE2BitPerm, + FeatureSVEBitPerm, FeatureCCIDX, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, @@ -978,7 +978,7 @@ FeatureRCPC, FeatureRDM]; list<SubtargetFeature> NeoverseV2 = [HasV9_0aOps, FeatureBF16, FeatureSPE, FeaturePerfMon, FeatureETE, FeatureMatMulInt8, - FeatureNEON, FeatureSVE2BitPerm, FeatureFP16FML, + FeatureNEON, FeatureSVEBitPerm, FeatureFP16FML, FeatureMTE, FeatureRandGen, FeatureCCIDX, FeatureSVE, FeatureSVE2, FeatureSSBS, FeatureFullFP16, FeatureDotProd, @@ -988,7 +988,7 @@ FeatureFullFP16, FeatureLS64, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, FeatureCCIDX, - FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE, + FeatureSPE_EEF, FeatureSVEBitPerm, FeatureBRBE, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, 
FeatureMatMulInt8, FeatureJS, FeatureLSE, @@ -996,7 +996,7 @@ list<SubtargetFeature> NeoverseV3AE = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureLS64, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, - FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE, + FeatureSPE_EEF, FeatureSVEBitPerm, FeatureBRBE, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureCCIDX, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 7dd6d49bf2022..22715c61126d1 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3946,12 +3946,12 @@ let Predicates = [HasSVE2SHA3] in { defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>; } // End HasSVE2SHA3 -let Predicates = [HasSVE2BitPerm] in { +let Predicates = [HasSVEBitPerm, HasNonStreamingSVE2orSSVE_BitPerm] in { // SVE2 bitwise permute defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>; defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>; defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>; -} // End HasSVE2BitPerm +} let Predicates = [HasSVEAES2, HasNonStreamingSVE2p1orSSVE_AES] in { // SVE_AES2 multi-vector instructions (x2) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 214fb4e352eeb..8e7e590c173ff 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/Support/InstructionCost.h" #include <cstdint> #include <optional> @@ -357,6 +358,68 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { return BaseT::isLegalNTLoad(DataType, Alignment); } + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional<unsigned> BinOp) const { + + InstructionCost Invalid = InstructionCost::getInvalid(); + InstructionCost Cost(TTI::TCC_Basic); + + if (Opcode != Instruction::Add) + return Invalid; + + if (InputTypeA != InputTypeB) + return Invalid; + + EVT InputEVT = EVT::getEVT(InputTypeA); + EVT AccumEVT = EVT::getEVT(AccumType); + + if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable()) + return Invalid; + if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd())) + return Invalid; + + if (InputEVT == MVT::i8) { + switch (VF.getKnownMinValue()) { + default: + return Invalid; + case 8: + if (AccumEVT == MVT::i32) + Cost *= 2; + else if (AccumEVT != MVT::i64) + return Invalid; + break; + case 16: + if (AccumEVT == MVT::i64) + Cost *= 2; + else if (AccumEVT != MVT::i32) + return Invalid; + break; + } + } else if (InputEVT == MVT::i16) { + // FIXME: Allow i32 accumulator but increase cost, as we would extend + // it to i64. + if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64) + return Invalid; + } else + return Invalid; + + // AArch64 supports lowering mixed extensions to a usdot but only if the + // i8mm or sve/streaming features are available. 
+ if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None || + (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && + !ST->isSVEorStreamingSVEAvailable())) + return Invalid; + + if (!BinOp || *BinOp != Instruction::Mul) + return Invalid; + + return Cost; + } + bool enableOrderedReductions() const { return true; } InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f44afd804c2bd..c37c57590f906 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3755,7 +3755,10 @@ static const struct Extension { {"sve2-aes", {AArch64::FeatureAliasSVE2AES, AArch64::FeatureSVEAES}}, {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, - {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}}, + {"sve-bitperm", {AArch64::FeatureSVEBitPerm}}, + {"sve2-bitperm", + {AArch64::FeatureAliasSVE2BitPerm, AArch64::FeatureSVEBitPerm, + AArch64::FeatureSVE2}}, {"sve2p1", {AArch64::FeatureSVE2p1}}, {"ls64", {AArch64::FeatureLS64}}, {"xs", {AArch64::FeatureXS}}, @@ -3827,6 +3830,7 @@ static const struct Extension { {"lsui", {AArch64::FeatureLSUI}}, {"occmo", {AArch64::FeatureOCCMO}}, {"pcdphint", {AArch64::FeaturePCDPHINT}}, + {"ssve-bitperm", {AArch64::FeatureSSVE_BitPerm}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ad5ee75f0c5d1..400c5f219cc70 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -380,7 +380,7 @@ extern char &AMDGPUAnnotateUniformValuesLegacyPassID; void initializeAMDGPUCodeGenPreparePass(PassRegistry&); extern char &AMDGPUCodeGenPrepareID; -void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &); +void initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(PassRegistry &); extern char &AMDGPURemoveIncompatibleFunctionsID; void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &); @@ -444,9 +444,9 @@ void initializeAMDGPUExternalAAWrapperPass(PassRegistry&); void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); -ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); -void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); -extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; +ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(); +void initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(PassRegistry &); +extern char &AMDGPUOpenCLEnqueuedBlockLoweringLegacyID; void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 830b50307f837..f4e651ec477d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -464,8 +464,11 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { NewLd->setMetadata(LLVMContext::MD_range, nullptr); unsigned ShAmt = Adjust * 8; - auto *NewVal = IRB.CreateBitCast( - IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType()); + Value *NewVal = IRB.CreateBitCast( + IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), + DL.typeSizeEqualsStoreSize(LI.getType()) ? 
IntNTy + : LI.getType()), + LI.getType()), LI.replaceAllUsesWith(NewVal); DeadInsts.emplace_back(&LI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 4f5ca08b46c13..fbd15ad176e3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -31,6 +31,7 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPUOpenCLEnqueuedBlockLowering.h" #include "AMDGPU.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallString.h" @@ -48,11 +49,16 @@ using namespace llvm; namespace { /// Lower enqueued blocks. -class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { +class AMDGPUOpenCLEnqueuedBlockLowering { +public: + bool run(Module &M); +}; + +class AMDGPUOpenCLEnqueuedBlockLoweringLegacy : public ModulePass { public: static char ID; - explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {} + explicit AMDGPUOpenCLEnqueuedBlockLoweringLegacy() : ModulePass(ID) {} private: bool runOnModule(Module &M) override; @@ -60,19 +66,32 @@ class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { } // end anonymous namespace -char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0; +char AMDGPUOpenCLEnqueuedBlockLoweringLegacy::ID = 0; -char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID = - AMDGPUOpenCLEnqueuedBlockLowering::ID; +char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringLegacyID = + AMDGPUOpenCLEnqueuedBlockLoweringLegacy::ID; -INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE, +INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLoweringLegacy, DEBUG_TYPE, "Lower OpenCL enqueued blocks", false, false) -ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { - return new AMDGPUOpenCLEnqueuedBlockLowering(); +ModulePass *llvm::createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass() { + return new AMDGPUOpenCLEnqueuedBlockLoweringLegacy(); +} + +bool AMDGPUOpenCLEnqueuedBlockLoweringLegacy::runOnModule(Module &M) { + AMDGPUOpenCLEnqueuedBlockLowering Impl; + return Impl.run(M); +} + +PreservedAnalyses +AMDGPUOpenCLEnqueuedBlockLoweringPass::run(Module &M, ModuleAnalysisManager &) { + AMDGPUOpenCLEnqueuedBlockLowering Impl; + if (Impl.run(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } -bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { +bool AMDGPUOpenCLEnqueuedBlockLowering::run(Module &M) { DenseSet<Function *> Callers; auto &C = M.getContext(); bool Changed = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h new file mode 100644 index 0000000000000..16ed7c18d8523 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h @@ -0,0 +1,23 @@ +//===- AMDGPUOpenCLEnqueuedBlockLowering.h -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { +class AMDGPUOpenCLEnqueuedBlockLoweringPass + : public PassInfoMixin<AMDGPUOpenCLEnqueuedBlockLoweringPass> { +public: + AMDGPUOpenCLEnqueuedBlockLoweringPass() = default; + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 182e825a59a41..6f322074ba74c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -21,11 +21,13 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers", AMDGPULowerBufferFatPointersPass(*this)) MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) +MODULE_PASS("amdgpu-lower-enqueued-block", AMDGPUOpenCLEnqueuedBlockLoweringPass()) MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this)) MODULE_PASS("amdgpu-perf-hint", AMDGPUPerfHintAnalysisPass( *static_cast<const GCNTargetMachine *>(this))) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) +MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) #undef MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp index 3a87070a326c2..e2e5c57397d02 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -33,25 +34,16 @@ namespace { using Generation = AMDGPUSubtarget::Generation; -class AMDGPURemoveIncompatibleFunctions : public ModulePass { +class AMDGPURemoveIncompatibleFunctions { public: - static char ID; - AMDGPURemoveIncompatibleFunctions(const TargetMachine *TM = nullptr) - : ModulePass(ID), TM(TM) { + : TM(TM) { assert(TM && "No TargetMachine!"); } - - StringRef getPassName() const override { return "AMDGPU Remove Incompatible Functions"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override {} - /// Checks a single function, returns true if the function must be deleted. 
bool checkFunction(Function &F); - bool runOnModule(Module &M) override { + bool run(Module &M) { assert(TM->getTargetTriple().isAMDGCN()); SmallVector<Function *> FnsToDelete; @@ -71,6 +63,28 @@ class AMDGPURemoveIncompatibleFunctions : public ModulePass { const TargetMachine *TM = nullptr; }; +class AMDGPURemoveIncompatibleFunctionsLegacy : public ModulePass { +public: + static char ID; + + AMDGPURemoveIncompatibleFunctionsLegacy(const TargetMachine *TM) + : ModulePass(ID), TM(TM) {} + + bool runOnModule(Module &M) override { + AMDGPURemoveIncompatibleFunctions Pass(TM); + return Pass.run(M); + } + + StringRef getPassName() const override { + return "AMDGPU Remove Incompatible Functions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override {} + +private: + const TargetMachine *TM = nullptr; +}; + StringRef getFeatureName(unsigned Feature) { for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) if (Feature == KV.Value) @@ -131,6 +145,15 @@ void reportFunctionRemoved(Function &F, unsigned Feature) { } } // end anonymous namespace +PreservedAnalyses +AMDGPURemoveIncompatibleFunctionsPass::run(Module &M, + ModuleAnalysisManager &MAM) { + AMDGPURemoveIncompatibleFunctions Impl(TM); + if (Impl.run(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) { if (F.isDeclaration()) return false; @@ -182,12 +205,12 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) { return false; } -INITIALIZE_PASS(AMDGPURemoveIncompatibleFunctions, DEBUG_TYPE, +INITIALIZE_PASS(AMDGPURemoveIncompatibleFunctionsLegacy, DEBUG_TYPE, "AMDGPU Remove Incompatible Functions", false, false) -char AMDGPURemoveIncompatibleFunctions::ID = 0; +char AMDGPURemoveIncompatibleFunctionsLegacy::ID = 0; ModulePass * llvm::createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *TM) { - return new AMDGPURemoveIncompatibleFunctions(TM); + return new AMDGPURemoveIncompatibleFunctionsLegacy(TM); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h new file mode 100644 index 0000000000000..e4c858588ece8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h @@ -0,0 +1,26 @@ +//===- AMDGPURemoveIncompatibleFunctions.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H +#define LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class AMDGPURemoveIncompatibleFunctionsPass + : public PassInfoMixin<AMDGPURemoveIncompatibleFunctionsPass> { + const TargetMachine *TM; + +public: + AMDGPURemoveIncompatibleFunctionsPass(const TargetMachine &TM) : TM(&TM) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0c9d7d00a8a4a..6d4547dbc82c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -22,7 +22,9 @@ #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" #include "AMDGPUMacroFusion.h" +#include "AMDGPUOpenCLEnqueuedBlockLowering.h" #include "AMDGPUPerfHintAnalysis.h" +#include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUSplitModule.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" @@ -48,6 +50,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" @@ -499,7 +502,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); - initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); + initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPURegBankCombinerPass(*PR); @@ -507,7 +510,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPromoteAllocaToVectorPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPULateCodeGenPrepareLegacyPass(*PR); - initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); + initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR); initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPULowerBufferFatPointersPass(*PR); initializeAMDGPUReserveWWMRegsPass(*PR); @@ -1173,7 +1176,7 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createR600OpenCLImageTypeLoweringPass()); // Replace OpenCL enqueued block function pointers with global variables. - addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); + addPass(createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass()); // Lower LDS accesses to global memory pass if address sanitizer is enabled. 
if (EnableSwLowerLDS) @@ -1925,7 +1928,8 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder( } void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { - // TODO: Missing AMDGPURemoveIncompatibleFunctions + if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) + addPass(AMDGPURemoveIncompatibleFunctionsPass(TM)); addPass(AMDGPUPrintfRuntimeBindingPass()); if (LowerCtorDtor) @@ -1941,7 +1945,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUAlwaysInlinePass()); addPass(AlwaysInlinerPass()); - // TODO: Missing OpenCLEnqueuedBlockLowering + addPass(AMDGPUOpenCLEnqueuedBlockLoweringPass()); // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) @@ -1955,8 +1959,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy)); - // FIXME: Adding atomic-expand manages to break -passes=atomic-expand - // addPass(AtomicExpandPass(TM)); + addPass(AtomicExpandPass(&TM)); if (TM.getOptLevel() > CodeGenOptLevel::None) { addPass(AMDGPUPromoteAllocaPass(TM)); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4325ab448e581..cdc1132579d8d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3390,6 +3390,9 @@ let SubtargetPredicate = isGFX9Plus in { let True16Predicate = NotHasTrue16BitInsts in def : PackB32Pat; +let True16Predicate = UseRealTrue16Insts in + def : PackB32Pat; + let True16Predicate = UseFakeTrue16Insts in def : PackB32Pat; } // End SubtargetPredicate = isGFX9Plus diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 62b5b704e99eb..6fdd83c4dc877 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -909,6 +909,26 @@ def RawBufferLoad : DXILOp<139, rawBufferLoad> { let stages = [Stages]; } +def RawBufferStore : DXILOp<140, rawBufferStore> { + let Doc = "writes to a RWByteAddressBuffer or RWStructuredBuffer"; + // Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask, Alignment + let arguments = [ + HandleTy, Int32Ty, Int32Ty, OverloadTy, OverloadTy, OverloadTy, OverloadTy, + Int8Ty, Int32Ty + ]; + let result = VoidTy; + let overloads = [ + Overloads, + Overloads + ]; + let stages = [Stages]; +} + def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { let Doc = "signed dot product of 4 x i8 vectors packed into i32, with " "accumulate to i32"; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index f43815bf21166..0c245c1a43d31 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -616,7 +616,10 @@ class OpLowerer { return false; } - [[nodiscard]] bool lowerTypedBufferStore(Function &F) { + [[nodiscard]] bool lowerBufferStore(Function &F, bool IsRaw) { + Triple TT(Triple(M.getTargetTriple())); + VersionTuple DXILVersion = TT.getDXILVersion(); + const DataLayout &DL = F.getDataLayout(); IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int8Ty = IRB.getInt8Ty(); Type *Int32Ty = IRB.getInt32Ty(); @@ -627,51 +630,75 @@ class OpLowerer { Value *Handle = createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType()); Value *Index0 = CI->getArgOperand(1); - Value *Index1 = UndefValue::get(Int32Ty); - // For typed stores, the mask must always cover all four elements. 
- Constant *Mask = ConstantInt::get(Int8Ty, 0xF); + Value *Index1 = IsRaw ? CI->getArgOperand(2) : UndefValue::get(Int32Ty); + + Value *Data = CI->getArgOperand(IsRaw ? 3 : 2); + Type *DataTy = Data->getType(); + Type *ScalarTy = DataTy->getScalarType(); - Value *Data = CI->getArgOperand(2); - auto *DataTy = dyn_cast<FixedVectorType>(Data->getType()); - if (!DataTy || DataTy->getNumElements() != 4) + uint64_t NumElements = + DL.getTypeSizeInBits(DataTy) / DL.getTypeSizeInBits(ScalarTy); + Value *Mask = ConstantInt::get(Int8Ty, ~(~0U << NumElements)); + + // TODO: check that we only have vector or scalar... + if (!IsRaw && NumElements != 4) return make_error<StringError>( "typedBufferStore data must be a vector of 4 elements", inconvertibleErrorCode()); + else if (NumElements > 4) + return make_error<StringError>( + "rawBufferStore data must have at most 4 elements", + inconvertibleErrorCode()); - // Since we're post-scalarizer, we likely have a vector that's constructed - // solely for the argument of the store. If so, just use the scalar values - // from before they're inserted into the temporary. std::array<Value *, 4> DataElements{nullptr, nullptr, nullptr, nullptr}; - auto *IEI = dyn_cast<InsertElementInst>(Data); - while (IEI) { - auto *IndexOp = dyn_cast<ConstantInt>(IEI->getOperand(2)); - if (!IndexOp) - break; - size_t IndexVal = IndexOp->getZExtValue(); - assert(IndexVal < 4 && "Too many elements for buffer store"); - DataElements[IndexVal] = IEI->getOperand(1); - IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0)); + if (DataTy == ScalarTy) + DataElements[0] = Data; + else { + // Since we're post-scalarizer, if we see a vector here it's likely + // constructed solely for the argument of the store. Just use the scalar + // values from before they're inserted into the temporary. + auto *IEI = dyn_cast<InsertElementInst>(Data); + while (IEI) { + auto *IndexOp = dyn_cast<ConstantInt>(IEI->getOperand(2)); + if (!IndexOp) + break; + size_t IndexVal = IndexOp->getZExtValue(); + assert(IndexVal < 4 && "Too many elements for buffer store"); + DataElements[IndexVal] = IEI->getOperand(1); + IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0)); + } } // If for some reason we weren't able to forward the arguments from the - // scalarizer artifact, then we need to actually extract elements from the - // vector. - for (int I = 0, E = 4; I != E; ++I) + // scalarizer artifact, then we may need to actually extract elements from + // the vector. + for (int I = 0, E = NumElements; I < E; ++I) if (DataElements[I] == nullptr) DataElements[I] = IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, I)); + // For any elements beyond the length of the vector, fill up with undef. 
+ for (int I = NumElements, E = 4; I < E; ++I) + if (DataElements[I] == nullptr) + DataElements[I] = UndefValue::get(ScalarTy); - std::array<Value *, 8> Args{ + dxil::OpCode Op = OpCode::BufferStore; + SmallVector<Value *, 9> Args{ Handle, Index0, Index1, DataElements[0], DataElements[1], DataElements[2], DataElements[3], Mask}; + if (IsRaw && DXILVersion >= VersionTuple(1, 2)) { + Op = OpCode::RawBufferStore; + // RawBufferStore requires the alignment + Args.push_back( + ConstantInt::get(Int32Ty, DL.getPrefTypeAlign(ScalarTy).value())); + } Expected<CallInst *> OpCall = - OpBuilder.tryCreateOp(OpCode::BufferStore, Args, CI->getName()); + OpBuilder.tryCreateOp(Op, Args, CI->getName()); if (Error E = OpCall.takeError()) return E; CI->eraseFromParent(); // Clean up any leftover `insertelement`s - IEI = dyn_cast<InsertElementInst>(Data); + auto *IEI = dyn_cast<InsertElementInst>(Data); while (IEI && IEI->use_empty()) { InsertElementInst *Tmp = IEI; IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0)); @@ -776,11 +803,14 @@ class OpLowerer { HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true); break; case Intrinsic::dx_resource_store_typedbuffer: - HasErrors |= lowerTypedBufferStore(F); + HasErrors |= lowerBufferStore(F, /*IsRaw=*/false); break; case Intrinsic::dx_resource_load_rawbuffer: HasErrors |= lowerRawBufferLoad(F); break; + case Intrinsic::dx_resource_store_rawbuffer: + HasErrors |= lowerBufferStore(F, /*IsRaw=*/true); + break; case Intrinsic::dx_resource_updatecounter: HasErrors |= lowerUpdateCounter(F); break; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 5afe6b2d2883d..5fd5c226eef89 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -15,12 +15,14 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" @@ -300,6 +302,38 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD, return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx); } +// TODO: We might need to refactor this to be more generic, +// in case we need more metadata to be replaced. 
+static void translateBranchMetadata(Module &M) { + for (Function &F : M) { + for (BasicBlock &BB : F) { + Instruction *BBTerminatorInst = BB.getTerminator(); + + MDNode *HlslControlFlowMD = + BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + + if (!HlslControlFlowMD) + continue; + + assert(HlslControlFlowMD->getNumOperands() == 2 && + "invalid operands for hlsl.controlflow.hint"); + + MDBuilder MDHelper(M.getContext()); + ConstantInt *Op1 = + mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1)); + + SmallVector<Metadata *, 2> Vals( + ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"), + MDHelper.createConstant(Op1)}); + + MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals); + + BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); + BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); + } + } +} + static void translateMetadata(Module &M, DXILBindingMap &DBM, DXILResourceTypeMap &DRTM, const Resources &MDResources, @@ -372,6 +406,7 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M, const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M); translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI); + translateBranchMetadata(M); return PreservedAnalyses::all(); } @@ -409,6 +444,7 @@ class DXILTranslateMetadataLegacy : public ModulePass { getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI); + translateBranchMetadata(M); return true; } }; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index c3e72d6ce3a3f..6a95d9ebef6c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -725,6 +725,23 @@ let hasSideEffects = false in { def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">; def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">; + + // Float to TF32 conversions + multiclass CVT_TO_TF32<string Modifier, list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> { + defvar Intr = !cast<Intrinsic>("int_nvvm_f2tf32_" # !subst(".", "_", Modifier)); + + def NAME : NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src), + "cvt." 
# Modifier # ".tf32.f32 \t$dst, $src;", + [(set i32:$dst, (Intr f32:$src))]>, + Requires; + } + + defm CVT_to_tf32_rn : CVT_TO_TF32<"rn">; + defm CVT_to_tf32_rz : CVT_TO_TF32<"rz">; + defm CVT_to_tf32_rn_relu : CVT_TO_TF32<"rn.relu">; + defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">; + defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>; + defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>; } def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{ diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 22339ebc5484f..4f144cc641080 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1722,11 +1722,6 @@ def : Pat<(int_nvvm_f2bf16_rz f32:$a), def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a), (CVT_bf16_f32 $a, CvtRZ_RELU)>; -def CVT_tf32_f32 : - NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a), - "cvt.rna.tf32.f32 \t$dest, $a;", - [(set i32:$dest, (int_nvvm_f2tf32_rna f32:$a))]>; - def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};", Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 2205c67c2d21b..8177280044bf4 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -717,6 +717,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isUImm6() const { return IsUImm<6>(); } bool isUImm7() const { return IsUImm<7>(); } bool isUImm8() const { return IsUImm<8>(); } + bool isUImm10() const { return IsUImm<10>(); } bool isUImm11() const { return IsUImm<11>(); } bool isUImm16() const { return IsUImm<16>(); } bool isUImm20() const { return IsUImm<20>(); } @@ -1590,6 +1591,8 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 9), (1 << 9) - 16, "immediate must be a multiple of 16 bytes and non-zero in the range"); + case Match_InvalidUImm10: + return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 10) - 1); case Match_InvalidUImm11: return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 11) - 1); case Match_InvalidSImm12: diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index a490910154eb4..971ef90c63327 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -700,6 +700,8 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, "Qualcomm uC Conditional Load Immediate custom opcode table"); TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicm, DecoderTableXqcicm32, "Qualcomm uC Conditional Move custom opcode table"); + TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciint, DecoderTableXqciint32, + "Qualcomm uC Interrupts custom opcode table"); TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table"); return MCDisassembler::Fail; @@ -732,6 +734,8 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size, TRY_TO_DECODE_FEATURE( RISCV::FeatureVendorXqcicm, DecoderTableXqcicm16, "Qualcomm uC Conditional Move custom 16bit opcode table"); + TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciint, DecoderTableXqciint16, + "Qualcomm uC Interrupts custom 16bit opcode table"); TRY_TO_DECODE_AND_ADD_SP(STI.hasFeature(RISCV::FeatureVendorXwchc), 
DecoderTableXwchc16, "WCH QingKe XW custom opcode table"); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 7048e40822342..ab04b09a7ad15 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -313,6 +313,7 @@ enum OperandType : unsigned { OPERAND_UIMM8_LSB000, OPERAND_UIMM8_GE32, OPERAND_UIMM9_LSB000, + OPERAND_UIMM10, OPERAND_UIMM10_LSB00_NONZERO, OPERAND_UIMM11, OPERAND_UIMM12, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 01bc5387e672e..f721d7148526b 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1302,6 +1302,14 @@ def HasVendorXqcicm AssemblerPredicate<(all_of FeatureVendorXqcicm), "'Xqcicm' (Qualcomm uC Conditional Move Extension)">; +def FeatureVendorXqciint + : RISCVExperimentalExtension<0, 2, "Qualcomm uC Interrupts Extension", + [FeatureStdExtZca]>; +def HasVendorXqciint + : Predicate<"Subtarget->hasVendorXqciint()">, + AssemblerPredicate<(all_of FeatureVendorXqciint), + "'Xqciint' (Qualcomm uC Interrupts Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index f24940795e433..1f7e8d87a11b0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2473,6 +2473,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, CASE_OPERAND_UIMM(6) CASE_OPERAND_UIMM(7) CASE_OPERAND_UIMM(8) + CASE_OPERAND_UIMM(10) CASE_OPERAND_UIMM(12) CASE_OPERAND_UIMM(20) // clang-format on diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 6f15646852f91..ce8c0c0a3d4e5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -28,6 +28,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>; +def uimm10 : RISCVUImmLeafOp<10>; + def uimm11 : RISCVUImmLeafOp<11>; //===----------------------------------------------------------------------===// @@ -166,6 +168,36 @@ class QCIMVCCI<bits<3> funct3, string opcodestr, DAGOperand immType> let rs2 = imm; } +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class QCIRVInst16CI_RS1<bits<5> funct5, string OpcodeStr> + : RVInst16CI<0b000, 0b10, (outs), (ins GPRNoX0:$rs1), OpcodeStr, "$rs1"> { + bits<5> rs1; + + let Inst{12} = 0b1; + let Inst{11-7} = rs1; + let Inst{6-2} = funct5{4-0}; +} + +let hasSideEffects = 1 in +class QCIRVInst16CI_NONE<bits<5> funct5, string OpcodeStr> + : RVInst16CI<0b000, 0b10, (outs), (ins), OpcodeStr, ""> { + let Inst{12} = 0b1; + let Inst{11-7} = funct5; + let Inst{6-2} = 0b00100; +} + +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class QCIInt_IMM<bits<1> funct1, string opcodestr> + : RVInstIBase<0b000, OPC_SYSTEM, (outs), (ins uimm10:$imm10), opcodestr, + "$imm10"> { + bits<10> imm10; + + let rd = 0; + let rs1 = imm10{4-0}; + let Inst{31-25} = {0b110011, funct1}; + let Inst{24-20} = imm10{9-5}; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -312,6 +344,38 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def QC_MVGEUI : QCIMVCCI<0b111, "qc.mvgeui", uimm5>; } // Predicates = [HasVendorXqcicm, IsRV32], 
DecoderNamespace = "Xqcicm" +let Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" in { + let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in + def QC_C_DIR : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd), (ins), + "qc.c.dir", "$rd"> { + bits<5> rd; + + let Inst{12} = 0b1; + let Inst{11-7} = rd; + let Inst{6-2} = 0b00000; + } + + def QC_SETINTI : QCIInt_IMM<0b0, "qc.setinti">; + def QC_CLRINTI : QCIInt_IMM<0b1, "qc.clrinti">; + + def QC_C_EIR : QCIRVInst16CI_RS1<0b00001, "qc.c.eir">; + def QC_C_SETINT : QCIRVInst16CI_RS1<0b00010, "qc.c.setint">; + def QC_C_CLRINT : QCIRVInst16CI_RS1<0b00011, "qc.c.clrint">; + + let mayLoad = 0, mayStore = 0 in { + def QC_C_DI : QCIRVInst16CI_NONE<0b10110, "qc.c.di">; + def QC_C_EI : QCIRVInst16CI_NONE<0b10111, "qc.c.ei">; + } // mayLoad =0, mayStore = 0 + + let mayLoad = 1, mayStore = 1 in { + def QC_C_MIENTER : QCIRVInst16CI_NONE<0b10000, "qc.c.mienter">; + def QC_C_MIENTER_NEST : QCIRVInst16CI_NONE<0b10001, "qc.c.mienter.nest">; + } // mayLoad = 1, mayStore = 1 + + let mayLoad = 1, mayStore = 1, isReturn = 1, isTerminator = 1 in + def QC_C_MILEAVERET : QCIRVInst16CI_NONE<0b10100, "qc.c.mileaveret">; +} // Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" + //===----------------------------------------------------------------------===// // Aliases //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp index cc6daf7ef3442..c23a6c3e8bbe8 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp @@ -56,20 +56,12 @@ getConvergenceTokenInternal(BasicBlockType *BB) { "Output type must be an intrinsic instruction."); for (auto &I : *BB) { - if (auto *II = dyn_cast(&I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::experimental_convergence_entry: - case Intrinsic::experimental_convergence_loop: - return II; - case Intrinsic::experimental_convergence_anchor: { - auto Bundle = II->getOperandBundle(LLVMContext::OB_convergencectrl); - assert(Bundle->Inputs.size() == 1 && - Bundle->Inputs[0]->getType()->isTokenTy()); - auto TII = dyn_cast(Bundle->Inputs[0].get()); - assert(TII != nullptr); - return TII; - } - } + if (auto *CI = dyn_cast(&I)) { + // Make sure that the anchor or entry intrinsics did not reach here with a + // parent token. This should have failed the verifier. 
+ assert(CI->isLoop() || + !CI->getOperandBundle(LLVMContext::OB_convergencectrl)); + return CI; } if (auto *CI = dyn_cast<CallInst>(&I)) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 28c9b81db51f5..1d6be7619ecf4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/IntrinsicsSPIRV.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "spirv-isel" @@ -45,6 +46,17 @@ using ExtInstList = std::vector<std::pair<SPIRV::InstructionSet::InstructionSet, uint32_t>>; namespace { +llvm::SPIRV::SelectionControl::SelectionControl +getSelectionOperandForImm(int Imm) { + if (Imm == 2) + return SPIRV::SelectionControl::Flatten; + if (Imm == 1) + return SPIRV::SelectionControl::DontFlatten; + if (Imm == 0) + return SPIRV::SelectionControl::None; + llvm_unreachable("Invalid immediate"); +} + #define GET_GLOBALISEL_PREDICATE_BITSET #include "SPIRVGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET @@ -274,10 +286,10 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - void selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType, + bool selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - void selectImageWriteIntrinsic(MachineInstr &I) const; + bool selectImageWriteIntrinsic(MachineInstr &I) const; // Utilities std::pair<Register, bool> @@ -305,7 +317,7 @@ Register IndexReg, bool IsNonUniform, MachineIRBuilder MIRBuilder) const; SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const; - void extractSubvector(Register &ResVReg, const SPIRVType *ResType, + bool extractSubvector(Register &ResVReg, const SPIRVType *ResType, Register &ReadReg, MachineInstr &InsertionPoint) const; bool BuildCOPY(Register DestReg, Register SrcReg, MachineInstr &I) const; bool loadVec3BuiltinInputID(SPIRV::BuiltIn::BuiltIn BuiltInValue, @@ -2818,12 +2830,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, } return MIB.constrainAllUses(TII, TRI, RBI); } - case Intrinsic::spv_loop_merge: - case Intrinsic::spv_selection_merge: { - const auto Opcode = IID == Intrinsic::spv_selection_merge - ? 
SPIRV::OpSelectionMerge - : SPIRV::OpLoopMerge; - auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)); + case Intrinsic::spv_loop_merge: { + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpLoopMerge)); for (unsigned i = 1; i < I.getNumExplicitOperands(); ++i) { assert(I.getOperand(i).isMBB()); MIB.addMBB(I.getOperand(i).getMBB()); @@ -2831,6 +2839,15 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, MIB.addImm(SPIRV::SelectionControl::None); return MIB.constrainAllUses(TII, TRI, RBI); } + case Intrinsic::spv_selection_merge: { + auto MIB = + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSelectionMerge)); + assert(I.getOperand(1).isMBB() && + "operand 1 to spv_selection_merge must be a basic block"); + MIB.addMBB(I.getOperand(1).getMBB()); + MIB.addImm(getSelectionOperandForImm(I.getOperand(2).getImm())); + return MIB.constrainAllUses(TII, TRI, RBI); + } case Intrinsic::spv_cmpxchg: return selectAtomicCmpXchg(ResVReg, ResType, I); case Intrinsic::spv_unreachable: @@ -3002,12 +3019,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectHandleFromBinding(ResVReg, ResType, I); } case Intrinsic::spv_resource_store_typedbuffer: { - selectImageWriteIntrinsic(I); - return true; + return selectImageWriteIntrinsic(I); } case Intrinsic::spv_resource_load_typedbuffer: { - selectReadImageIntrinsic(ResVReg, ResType, I); - return true; + return selectReadImageIntrinsic(ResVReg, ResType, I); } case Intrinsic::spv_discard: { return selectDiscard(ResVReg, ResType, I); @@ -3049,7 +3064,7 @@ bool SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg, .constrainAllUses(TII, TRI, RBI); } -void SPIRVInstructionSelector::selectReadImageIntrinsic( +bool SPIRVInstructionSelector::selectReadImageIntrinsic( Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { // If the load of the image is in a different basic block, then @@ -3064,35 +3079,40 @@ void SPIRVInstructionSelector::selectReadImageIntrinsic( uint64_t ResultSize = GR.getScalarOrVectorComponentCount(ResType); if (ResultSize == 4) { - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) + return BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpImageRead)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addUse(ImageReg) - .addUse(I.getOperand(3).getReg()); - return; + .addUse(I.getOperand(3).getReg()) + .constrainAllUses(TII, TRI, RBI); } SPIRVType *ReadType = widenTypeToVec4(ResType, I); Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType)); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) - .addDef(ReadReg) - .addUse(GR.getSPIRVTypeID(ReadType)) - .addUse(ImageReg) - .addUse(I.getOperand(3).getReg()); + bool Succeed = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) + .addDef(ReadReg) + .addUse(GR.getSPIRVTypeID(ReadType)) + .addUse(ImageReg) + .addUse(I.getOperand(3).getReg()) + .constrainAllUses(TII, TRI, RBI); + if (!Succeed) + return false; if (ResultSize == 1) { - BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(SPIRV::OpCompositeExtract)) + return BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpCompositeExtract)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addUse(ReadReg) - .addImm(0); - return; + .addImm(0) + .constrainAllUses(TII, TRI, RBI); } - extractSubvector(ResVReg, ResType, ReadReg, I); + return extractSubvector(ResVReg, ResType, ReadReg, I); } -void SPIRVInstructionSelector::extractSubvector( +bool 
SPIRVInstructionSelector::extractSubvector( Register &ResVReg, const SPIRVType *ResType, Register &ReadReg, MachineInstr &InsertionPoint) const { SPIRVType *InputType = GR.getResultType(ReadReg); @@ -3108,12 +3128,16 @@ void SPIRVInstructionSelector::extractSubvector( const TargetRegisterClass *ScalarRegClass = GR.getRegClass(ScalarType); for (uint64_t I = 0; I < ResultSize; I++) { Register ComponentReg = MRI->createVirtualRegister(ScalarRegClass); - BuildMI(*InsertionPoint.getParent(), InsertionPoint, - InsertionPoint.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract)) - .addDef(ComponentReg) - .addUse(ScalarType->getOperand(0).getReg()) - .addUse(ReadReg) - .addImm(I); + bool Succeed = BuildMI(*InsertionPoint.getParent(), InsertionPoint, + InsertionPoint.getDebugLoc(), + TII.get(SPIRV::OpCompositeExtract)) + .addDef(ComponentReg) + .addUse(ScalarType->getOperand(0).getReg()) + .addUse(ReadReg) + .addImm(I) + .constrainAllUses(TII, TRI, RBI); + if (!Succeed) + return false; ComponentRegisters.emplace_back(ComponentReg); } @@ -3125,9 +3149,10 @@ void SPIRVInstructionSelector::extractSubvector( for (Register ComponentReg : ComponentRegisters) MIB.addUse(ComponentReg); + return MIB.constrainAllUses(TII, TRI, RBI); } -void SPIRVInstructionSelector::selectImageWriteIntrinsic( +bool SPIRVInstructionSelector::selectImageWriteIntrinsic( MachineInstr &I) const { // If the load of the image is in a different basic block, then // this will generate invalid code. A proper solution is to move @@ -3142,10 +3167,12 @@ void SPIRVInstructionSelector::selectImageWriteIntrinsic( Register DataReg = I.getOperand(3).getReg(); assert(GR.getResultType(DataReg)->getOpcode() == SPIRV::OpTypeVector); assert(GR.getScalarOrVectorComponentCount(GR.getResultType(DataReg)) == 4); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageWrite)) + return BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpImageWrite)) .addUse(ImageReg) .addUse(CoordinateReg) - .addUse(DataReg); + .addUse(DataReg) + .constrainAllUses(TII, TRI, RBI); } Register SPIRVInstructionSelector::buildPointerToResource( diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 336cde4e78224..2e4343c7922f1 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -18,14 +18,16 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" -#include "llvm/IR/Analysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" @@ -646,8 +648,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(BB.getParent(), &BB); - SmallVector<Value *, 1> Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); Modified = true; } @@ -769,10 +770,9 @@ class SPIRVStructurizer : public FunctionPass { BasicBlock *Merge = Candidates[0]; auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge); - SmallVector<Value *, 1> Args = 
{MergeAddress}; IRBuilder<> Builder(&BB); Builder.SetInsertPoint(BB.getTerminator()); - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); } return Modified; @@ -1105,8 +1105,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge); - SmallVector Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); continue; } @@ -1120,8 +1119,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(NewMerge->getParent(), NewMerge); - SmallVector Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); } return Modified; @@ -1208,6 +1206,27 @@ class SPIRVStructurizer : public FunctionPass { AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); } + + void createOpSelectMerge(IRBuilder<> *Builder, BlockAddress *MergeAddress) { + Instruction *BBTerminatorInst = Builder->GetInsertBlock()->getTerminator(); + + MDNode *MDNode = BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + + ConstantInt *BranchHint = llvm::ConstantInt::get(Builder->getInt32Ty(), 0); + + if (MDNode) { + assert(MDNode->getNumOperands() == 2 && + "invalid metadata hlsl.controlflow.hint"); + BranchHint = mdconst::extract(MDNode->getOperand(1)); + + assert(BranchHint && "invalid metadata value for hlsl.controlflow.hint"); + } + + llvm::SmallVector Args = {MergeAddress, BranchHint}; + + Builder->CreateIntrinsic(Intrinsic::spv_selection_merge, + {MergeAddress->getType()}, {Args}); + } }; } // namespace llvm @@ -1229,8 +1248,11 @@ FunctionPass *llvm::createSPIRVStructurizerPass() { PreservedAnalyses SPIRVStructurizerWrapper::run(Function &F, FunctionAnalysisManager &AF) { - FunctionPass *StructurizerPass = createSPIRVStructurizerPass(); - if (!StructurizerPass->runOnFunction(F)) + + auto FPM = legacy::FunctionPassManager(F.getParent()); + FPM.add(createSPIRVStructurizerPass()); + + if (!FPM.run(F)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet(); diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 07a00af881afe..d715fd1903802 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -340,46 +340,38 @@ struct X86Operand final : public MCParsedAsmOperand { return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR; } + bool isMem32_RC128() const { + return isMem32() && isMemIndexReg(X86::XMM0, X86::XMM15); + } bool isMem64_RC128() const { return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM15); } - bool isMem128_RC128() const { - return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM15); - } - bool isMem128_RC256() const { - return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM15); + bool isMem32_RC256() const { + return isMem32() && isMemIndexReg(X86::YMM0, X86::YMM15); } - bool isMem256_RC128() const { - return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM15); - } - bool isMem256_RC256() const { - return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM15); + bool isMem64_RC256() const { + return isMem64() && isMemIndexReg(X86::YMM0, X86::YMM15); } + bool isMem32_RC128X() const { + return isMem32() && X86II::isXMMReg(Mem.IndexReg); + } bool isMem64_RC128X() const { return isMem64() && 
X86II::isXMMReg(Mem.IndexReg); } - bool isMem128_RC128X() const { - return isMem128() && X86II::isXMMReg(Mem.IndexReg); + bool isMem32_RC256X() const { + return isMem32() && X86II::isYMMReg(Mem.IndexReg); } - bool isMem128_RC256X() const { - return isMem128() && X86II::isYMMReg(Mem.IndexReg); + bool isMem64_RC256X() const { + return isMem64() && X86II::isYMMReg(Mem.IndexReg); } - bool isMem256_RC128X() const { - return isMem256() && X86II::isXMMReg(Mem.IndexReg); + bool isMem32_RC512() const { + return isMem32() && X86II::isZMMReg(Mem.IndexReg); } - bool isMem256_RC256X() const { - return isMem256() && X86II::isYMMReg(Mem.IndexReg); - } - bool isMem256_RC512() const { - return isMem256() && X86II::isZMMReg(Mem.IndexReg); - } - bool isMem512_RC256X() const { - return isMem512() && X86II::isYMMReg(Mem.IndexReg); - } - bool isMem512_RC512() const { - return isMem512() && X86II::isZMMReg(Mem.IndexReg); + bool isMem64_RC512() const { + return isMem64() && X86II::isZMMReg(Mem.IndexReg); } + bool isMem512_GR16() const { if (!isMem512()) return false; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 596139d084570..add51fac4b9e6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41701,6 +41701,11 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, return SDValue(); } +static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, + ArrayRef Ops, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget); + /// Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, @@ -42401,25 +42406,17 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, return SDValue(); } case X86ISD::VPERMV3: { - SDValue V1 = peekThroughBitcasts(N.getOperand(0)); - SDValue V2 = peekThroughBitcasts(N.getOperand(2)); - MVT SVT = V1.getSimpleValueType(); - // Combine VPERMV3 to widened VPERMV if the two source operands are split - // from the same vector. - if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR && - V1.getConstantOperandVal(1) == 0 && - V2.getOpcode() == ISD::EXTRACT_SUBVECTOR && - V2.getConstantOperandVal(1) == SVT.getVectorNumElements() && - V1.getOperand(0) == V2.getOperand(0)) { - EVT NVT = V1.getOperand(0).getValueType(); - if (NVT.is256BitVector() || - (NVT.is512BitVector() && Subtarget.hasEVEX512())) { - MVT WideVT = MVT::getVectorVT( - VT.getScalarType(), NVT.getSizeInBits() / VT.getScalarSizeInBits()); + // Combine VPERMV3 to widened VPERMV if the two source operands can be + // freely concatenated. 
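The comment above captures the whole idea of the rewritten combine: a two-source VPERMV3 whose sources concatenate for free is the same shuffle as a single-source VPERMV over the concatenation. A standalone sketch of that equivalence (plain C++, illustrative names only, not part of the patch):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// VPERMV3-style permute: index I selects from the combined pool {A, B}.
std::vector<int> permv3(const std::vector<int> &A, const std::vector<int> &B,
                        const std::vector<std::size_t> &Idx) {
  std::vector<int> Out;
  for (std::size_t I : Idx)
    Out.push_back(I < A.size() ? A[I] : B[I - A.size()]);
  return Out;
}

// Widened VPERMV-style permute: one source, the concatenation of A and B.
std::vector<int> permvWide(const std::vector<int> &Concat,
                           const std::vector<std::size_t> &Idx) {
  std::vector<int> Out;
  for (std::size_t I : Idx)
    Out.push_back(Concat[I]);
  return Out;
}

int main() {
  std::vector<int> A = {10, 11, 12, 13}, B = {20, 21, 22, 23};
  std::vector<std::size_t> Idx = {7, 0, 5, 2};
  std::vector<int> Concat = A;
  Concat.insert(Concat.end(), B.begin(), B.end());
  // When A and B concatenate for free, the two forms are interchangeable.
  assert(permv3(A, B, Idx) == permvWide(Concat, Idx));
  return 0;
}
```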
+ if (VT.is128BitVector() || + (VT.is256BitVector() && Subtarget.useAVX512Regs())) { + SDValue Ops[] = {N.getOperand(0), N.getOperand(2)}; + MVT WideVT = VT.getDoubleNumVectorElementsVT(); + if (SDValue ConcatSrc = + combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) { SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG, DL, WideVT.getSizeInBits()); - SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, - DAG.getBitcast(WideVT, V1.getOperand(0))); + SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, DAG.getIntPtrConstant(0, DL)); } @@ -42427,6 +42424,9 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SmallVector Ops; SmallVector Mask; if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) { + assert(Mask.size() == NumElts && "Unexpected shuffle mask size"); + SDValue V1 = peekThroughBitcasts(N.getOperand(0)); + SDValue V2 = peekThroughBitcasts(N.getOperand(2)); MVT MaskVT = N.getOperand(1).getSimpleValueType(); // Canonicalize to VPERMV if both sources are the same. if (V1 == V2) { @@ -57369,10 +57369,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Op0.getOperand(1)); } - // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128. - // Only concat of subvector high halves which vperm2x128 is best at. // TODO: This should go in combineX86ShufflesRecursively eventually. - if (VT.is256BitVector() && NumOps == 2) { + if (NumOps == 2) { SDValue Src0 = peekThroughBitcasts(Ops[0]); SDValue Src1 = peekThroughBitcasts(Ops[1]); if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR && @@ -57381,7 +57379,10 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, EVT SrcVT1 = Src1.getOperand(0).getValueType(); unsigned NumSrcElts0 = SrcVT0.getVectorNumElements(); unsigned NumSrcElts1 = SrcVT1.getVectorNumElements(); - if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() && + // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128. + // Only concat of subvector high halves which vperm2x128 is best at. + if (VT.is256BitVector() && SrcVT0.is256BitVector() && + SrcVT1.is256BitVector() && Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) && Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) { return DAG.getNode(X86ISD::VPERM2X128, DL, VT, @@ -57389,6 +57390,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, DAG.getBitcast(VT, Src1.getOperand(0)), DAG.getTargetConstant(0x31, DL, MVT::i8)); } + // concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x. 
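The fold named in the comment above rests on a simple identity: extracting the low and high halves of a vector and concatenating them reproduces the original vector, so the concat can be replaced by a bitcast of the source. A minimal standalone illustration (plain C++, not part of the patch):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// extract_subvector(X, Start) of half the width of X.
std::vector<int> extractHalf(const std::vector<int> &X, std::size_t Start) {
  return {X.begin() + Start, X.begin() + Start + X.size() / 2};
}

int main() {
  std::vector<int> X = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int> Lo = extractHalf(X, 0);            // lanes [0, N/2)
  std::vector<int> Hi = extractHalf(X, X.size() / 2); // lanes [N/2, N)
  std::vector<int> Concat = Lo;
  Concat.insert(Concat.end(), Hi.begin(), Hi.end());
  assert(Concat == X); // concat(lo half, hi half) gives back X unchanged
  return 0;
}
```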
+ if (Src0.getOperand(0) == Src1.getOperand(0) && + Src0.getConstantOperandAPInt(1) == 0 && + Src1.getConstantOperandAPInt(1) == + Src0.getValueType().getVectorNumElements()) { + return DAG.getBitcast(VT, extractSubVector(Src0.getOperand(0), 0, DAG, + DL, VT.getSizeInBits())); + } } } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index abf016000fc8e..9d8c123185a7c 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -10279,36 +10279,36 @@ multiclass avx512_gather opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_gather_q_pd dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME#D#SUFF#Z: avx512_gather, EVEX_V512, REX_W; + vy64xmem>, EVEX_V512, REX_W; defm NAME#Q#SUFF#Z: avx512_gather, EVEX_V512, REX_W; + vz64mem>, EVEX_V512, REX_W; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_gather, EVEX_V256, REX_W; + vx64xmem>, EVEX_V256, REX_W; defm NAME#Q#SUFF#Z256: avx512_gather, EVEX_V256, REX_W; + vy64xmem>, EVEX_V256, REX_W; defm NAME#D#SUFF#Z128: avx512_gather, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; defm NAME#Q#SUFF#Z128: avx512_gather, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; } } multiclass avx512_gather_d_ps dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME#D#SUFF#Z: avx512_gather, + defm NAME#D#SUFF#Z: avx512_gather, EVEX_V512; - defm NAME#Q#SUFF#Z: avx512_gather, + defm NAME#Q#SUFF#Z: avx512_gather, EVEX_V512; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_gather, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#Q#SUFF#Z256: avx512_gather, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#D#SUFF#Z128: avx512_gather, EVEX_V128; + vx32xmem>, EVEX_V128; defm NAME#Q#SUFF#Z128: avx512_gather, EVEX_V128; + vx32xmem, VK2WM>, EVEX_V128; } } @@ -10336,36 +10336,36 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain, multiclass avx512_scatter_q_pd dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME#D#SUFF#Z: avx512_scatter, EVEX_V512, REX_W; + vy64xmem>, EVEX_V512, REX_W; defm NAME#Q#SUFF#Z: avx512_scatter, EVEX_V512, REX_W; + vz64mem>, EVEX_V512, REX_W; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_scatter, EVEX_V256, REX_W; + vx64xmem>, EVEX_V256, REX_W; defm NAME#Q#SUFF#Z256: avx512_scatter, EVEX_V256, REX_W; + vy64xmem>, EVEX_V256, REX_W; defm NAME#D#SUFF#Z128: avx512_scatter, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; defm NAME#Q#SUFF#Z128: avx512_scatter, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; } } multiclass avx512_scatter_d_ps dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME#D#SUFF#Z: avx512_scatter, + defm NAME#D#SUFF#Z: avx512_scatter, EVEX_V512; - defm NAME#Q#SUFF#Z: avx512_scatter, + defm NAME#Q#SUFF#Z: avx512_scatter, EVEX_V512; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_scatter, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#Q#SUFF#Z256: avx512_scatter, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#D#SUFF#Z128: avx512_scatter, EVEX_V128; + vx32xmem>, EVEX_V128; defm NAME#Q#SUFF#Z128: avx512_scatter, EVEX_V128; + vx32xmem, VK2WM>, EVEX_V128; } } @@ -10385,52 +10385,52 @@ multiclass avx512_gather_scatter_prefetch opc, Format F, string OpcodeSt } defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm 
VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; multiclass cvt_by_vec_width opc, X86VectorVTInfo Vec, string OpcodeStr, SchedWrite Sched> { def rk : AVX512XS8I; def i512mem_GR64 : X86MemOperand<"printzmmwordmem", X86Mem512_GR64Operand, 512>; // Gather mem operands +def vx32mem : X86VMemOperand; def vx64mem : X86VMemOperand; -def vx128mem : X86VMemOperand; -def vx256mem : X86VMemOperand; -def vy128mem : X86VMemOperand; -def vy256mem : X86VMemOperand; +def vy32mem : X86VMemOperand; +def vy64mem : X86VMemOperand; +def vx32xmem : X86VMemOperand; def vx64xmem : X86VMemOperand; -def vx128xmem : X86VMemOperand; -def vx256xmem : X86VMemOperand; -def vy128xmem : 
X86VMemOperand; -def vy256xmem : X86VMemOperand; -def vy512xmem : X86VMemOperand; -def vz256mem : X86VMemOperand; -def vz512mem : X86VMemOperand; +def vy32xmem : X86VMemOperand; +def vy64xmem : X86VMemOperand; +def vz32mem : X86VMemOperand; +def vz64mem : X86VMemOperand; def shmem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 036d7d92f3f89..6aadb788c851e 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8078,26 +8078,26 @@ let Predicates = [HasAVX2] in { = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", - VR256, vx128mem, vx256mem>, REX_W; + VR256, vx64mem, vx64mem>, REX_W; defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", - VR256, vx128mem, vy256mem>, REX_W; + VR256, vx64mem, vy64mem>, REX_W; defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", - VR256, vx128mem, vy256mem>; + VR256, vx32mem, vy32mem>; defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", - VR128, vx64mem, vy128mem>; + VR128, vx32mem, vy32mem>; let ExeDomain = SSEPackedDouble in { defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", - VR256, vx128mem, vx256mem>, REX_W; + VR256, vx64mem, vx64mem>, REX_W; defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", - VR256, vx128mem, vy256mem>, REX_W; + VR256, vx64mem, vy64mem>, REX_W; } let ExeDomain = SSEPackedSingle in { defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", - VR256, vx128mem, vy256mem>; + VR256, vx32mem, vy32mem>; defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", - VR128, vx64mem, vy128mem>; + VR128, vx32mem, vy32mem>; } } } diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 7d0b8c333f72f..34ca03a47e0a4 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -276,6 +276,11 @@ void AArch64::ExtensionSet::disable(ArchExtKind E) { if (E == AEK_SVE2AES) disable(AEK_SVEAES); + if (E == AEK_SVE2BITPERM){ + disable(AEK_SVEBITPERM); + disable(AEK_SVE2); + } + if (!Enabled.test(E)) return; diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index d6e1eac0d85af..1995931abfe41 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -742,8 +742,8 @@ Error RISCVISAInfo::checkDependency() { bool HasZvl = MinVLen != 0; bool HasZcmt = Exts.count("zcmt") != 0; static constexpr StringLiteral XqciExts[] = { - {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, - {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; + {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, {"xqcics"}, + {"xqcicsr"}, {"xqciint"}, {"xqcilsm"}, {"xqcisls"}}; if (HasI && HasE) return getIncompatibleError("i", "e"); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 7454382412369..dd5a4ba5a4724 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -588,6 +588,19 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { } } + // cttz(Pow2) -> Log2(Pow2) + // ctlz(Pow2) -> BitWidth - 1 - Log2(Pow2) + if (auto *R = IC.tryGetLog2(Op0, match(Op1, m_One()))) { + if (IsTZ) + return IC.replaceInstUsesWith(II, R); + BinaryOperator *BO = BinaryOperator::CreateSub( + 
ConstantInt::get(R->getType(), R->getType()->getScalarSizeInBits() - 1), + R); + BO->setHasNoSignedWrap(); + BO->setHasNoUnsignedWrap(); + return BO; + } + KnownBits Known = IC.computeKnownBits(Op0, 0, &II); // Create a mask for bits above (ctlz) or below (cttz) the first known one. diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 8004552250b47..609678f9979c6 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -214,8 +214,15 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(), DL, TLI, ObjSizeEval, IRB, SE); } - if (Or) + if (Or) { + if (Opts.GuardKind) { + llvm::Value *Allow = IRB.CreateIntrinsic( + IRB.getInt1Ty(), Intrinsic::allow_ubsan_check, + {llvm::ConstantInt::getSigned(IRB.getInt8Ty(), *Opts.GuardKind)}); + Or = IRB.CreateAnd(Or, Allow); + } TrapInfo.push_back(std::make_pair(&I, Or)); + } } std::string Name; @@ -299,5 +306,7 @@ void BoundsCheckingPass::printPipeline( } if (Opts.Merge) OS << ";merge"; + if (Opts.GuardKind) + OS << ";guard=" << static_cast(*Opts.GuardKind); OS << ">"; } diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 3812e99508f73..b5ce860d73523 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1964,18 +1964,10 @@ NewGVN::ExprResult NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { if (PBranch->TrueEdge) { // If we know the previous predicate is true and we are in the true // edge then we may be implied true or false. - if (ICmpInst::isImpliedTrueByMatchingCmp(BranchPredicate, - OurPredicate)) { - return ExprResult::some( - createConstantExpression(ConstantInt::getTrue(CI->getType())), - PI); - } - - if (ICmpInst::isImpliedFalseByMatchingCmp(BranchPredicate, - OurPredicate)) { - return ExprResult::some( - createConstantExpression(ConstantInt::getFalse(CI->getType())), - PI); + if (auto R = ICmpInst::isImpliedByMatchingCmp(BranchPredicate, + OurPredicate)) { + auto *C = ConstantInt::getBool(CI->getType(), *R); + return ExprResult::some(createConstantExpression(C), PI); } } else { // Just handle the ne and eq cases, where if we have the same diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 7ddb9e22c8344..af9813775f242 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -975,6 +975,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::AllocatedPointer: case Attribute::AllocAlign: case Attribute::ByVal: + case Attribute::Captures: case Attribute::Dereferenceable: case Attribute::DereferenceableOrNull: case Attribute::ElementType: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d32a463a996c4..ee352c0b12302 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7531,6 +7531,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } continue; } + // The VPlan-based cost model is more accurate for partial reduction and + // comparing against the legacy cost isn't desirable. 
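The exemption above exists because a partial reduction folds a wide per-iteration input into a narrower accumulator mid-loop, a shape the legacy per-instruction cost model never sees. The recipe ultimately lowers to llvm.experimental.vector.partial.reduce.add, whose contract (as used here) only guarantees the overall sum; per-lane placement is target-defined. A sketch of one conforming model (plain C++, chunk-wise lane placement chosen for illustration, not part of the patch):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Models llvm.experimental.vector.partial.reduce.add(Acc, Wide) by splitting
// the wide operand into Acc-sized chunks and adding them in lane-wise. Only
// the total sum of the result is guaranteed by the intrinsic.
std::vector<long> partialReduceAdd(std::vector<long> Acc,
                                   const std::vector<long> &Wide) {
  for (std::size_t I = 0; I < Wide.size(); ++I)
    Acc[I % Acc.size()] += Wide[I];
  return Acc;
}

int main() {
  // The binary op runs at VF 8 while the accumulator PHI runs at VF 2.
  std::vector<long> Acc = {0, 0};
  std::vector<long> Wide = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<long> R = partialReduceAdd(Acc, Wide);
  // The scalar reduction result is only formed after the loop.
  assert(R[0] + R[1] == 36);
  return 0;
}
```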
+ if (isa(&R)) + return true; if (Instruction *UI = GetInstructionForCost(&R)) SeenInstrs.insert(UI); } @@ -7687,6 +7691,20 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( "AnyOf expected to start by comparing main resume value to original " "start value"); MainResumeValue = Cmp->getOperand(0); + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + using namespace llvm::PatternMatch; + Value *Cmp, *OrigResumeV; + bool IsExpectedPattern = + match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), + m_Specific(RdxDesc.getSentinelValue()), + m_Value(OrigResumeV))) && + match(Cmp, + m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), + m_Specific(RdxDesc.getRecurrenceStartValue()))); + assert(IsExpectedPattern && "Unexpected reduction resume pattern"); + (void)IsExpectedPattern; + MainResumeValue = OrigResumeV; } PHINode *MainResumePhi = cast(MainResumeValue); @@ -8751,6 +8769,105 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, return Recipe; } +/// Find all possible partial reductions in the loop and track all of those that +/// are valid so recipes can be formed later. +void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { + // Find all possible partial reductions. + SmallVector, 1> + PartialReductionChains; + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) + if (std::optional> Pair = + getScaledReduction(Phi, RdxDesc, Range)) + PartialReductionChains.push_back(*Pair); + + // A partial reduction is invalid if any of its extends are used by + // something that isn't another partial reduction. This is because the + // extends are intended to be lowered along with the reduction itself. + + // Build up a set of partial reduction bin ops for efficient use checking. + SmallSet PartialReductionBinOps; + for (const auto &[PartialRdx, _] : PartialReductionChains) + PartialReductionBinOps.insert(PartialRdx.BinOp); + + auto ExtendIsOnlyUsedByPartialReductions = + [&PartialReductionBinOps](Instruction *Extend) { + return all_of(Extend->users(), [&](const User *U) { + return PartialReductionBinOps.contains(U); + }); + }; + + // Check if each use of a chain's two extends is a partial reduction + // and only add those that don't have non-partial reduction users. + for (auto Pair : PartialReductionChains) { + PartialReductionChain Chain = Pair.first; + if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && + ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) + ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair)); + } +} + +std::optional> +VPRecipeBuilder::getScaledReduction(PHINode *PHI, + const RecurrenceDescriptor &Rdx, + VFRange &Range) { + // TODO: Allow scaling reductions when predicating. The select at + // the end of the loop chooses between the phi value and most recent + // reduction result, both of which have different VFs to the active lane + // mask when scaling. 
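The chain this function matches is reduction_bin_op(bin_op(extend(A), extend(B)), accumulator); in scalar form that is a dot-product step, and the scale factor falls straight out of the element widths. A standalone sketch (plain C++, assuming i8 inputs and an i32 accumulator, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

// Scalar shape of the matched chain:
// reduction_bin_op(bin_op(extend(A), extend(B)), accumulator).
int32_t dotStep(int32_t Acc, int8_t A, int8_t B) {
  return Acc + int32_t(A) * int32_t(B);
}

int main() {
  // Scale factor = bits(PHI element) / bits(input element) = 32 / 8 = 4,
  // so an input VF of 16 pairs with an accumulator PHI of VF 4.
  unsigned PhiBits = 32, InputBits = 8;
  assert(PhiBits / InputBits == 4);

  int32_t Acc = 0;
  int8_t A[4] = {1, -2, 3, 4}, B[4] = {5, 6, -7, 8};
  for (int I = 0; I < 4; ++I)
    Acc = dotStep(Acc, A[I], B[I]);
  assert(Acc == 4); // 5 - 12 - 21 + 32
  return 0;
}
```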
+ if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) + return std::nullopt; + + auto *Update = dyn_cast(Rdx.getLoopExitInstr()); + if (!Update) + return std::nullopt; + + Value *Op = Update->getOperand(0); + Value *PhiOp = Update->getOperand(1); + if (Op == PHI) { + Op = Update->getOperand(1); + PhiOp = Update->getOperand(0); + } + if (PhiOp != PHI) + return std::nullopt; + + auto *BinOp = dyn_cast(Op); + if (!BinOp || !BinOp->hasOneUse()) + return std::nullopt; + + using namespace llvm::PatternMatch; + Value *A, *B; + if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || + !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) + return std::nullopt; + + Instruction *ExtA = cast(BinOp->getOperand(0)); + Instruction *ExtB = cast(BinOp->getOperand(1)); + + TTI::PartialReductionExtendKind OpAExtend = + TargetTransformInfo::getPartialReductionExtendKind(ExtA); + TTI::PartialReductionExtendKind OpBExtend = + TargetTransformInfo::getPartialReductionExtendKind(ExtB); + + PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); + + unsigned TargetScaleFactor = + PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( + A->getType()->getPrimitiveSizeInBits()); + + if (LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + InstructionCost Cost = TTI->getPartialReductionCost( + Update->getOpcode(), A->getType(), B->getType(), PHI->getType(), + VF, OpAExtend, OpBExtend, + std::make_optional(BinOp->getOpcode())); + return Cost.isValid(); + }, + Range)) + return std::make_pair(Chain, TargetScaleFactor); + + return std::nullopt; +} + VPRecipeBase * VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, @@ -8775,9 +8892,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, - CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc)); + + // If the PHI is used by a partial reduction, set the scale factor. + std::optional> Pair = + getScaledReductionForInstr(RdxDesc.getLoopExitInstr()); + unsigned ScaleFactor = Pair ? Pair->second : 1; + PhiRecipe = new VPReductionPHIRecipe( + Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), + CM.useOrderedReductions(RdxDesc), ScaleFactor); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. 
If there are no users of the intermediate @@ -8809,6 +8931,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); + if (getScaledReductionForInstr(Instr)) + return tryToCreatePartialReduction(Instr, Operands); + if (!shouldWiden(Instr, Range)) return nullptr; @@ -8829,6 +8954,21 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, Operands, VPBB); } +VPRecipeBase * +VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, + ArrayRef Operands) { + assert(Operands.size() == 2 && + "Unexpected number of operands for partial reduction"); + + VPValue *BinOp = Operands[0]; + VPValue *Phi = Operands[1]; + if (isa(BinOp->getDefiningRecipe())) + std::swap(BinOp, Phi); + + return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, + Reduction); +} + void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -9252,7 +9392,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further @@ -9298,6 +9439,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); return Legal->blockNeedsPredication(BB) || NeedsBlends; }); + + RecipeBuilder.collectScaledReductions(Range); + auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { @@ -9521,7 +9665,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // Collect mapping of IR header phis to header phi recipes, to be used in // addScalarResumePhis. - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder); for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { if (isa(&R)) continue; @@ -10282,6 +10427,19 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, cast(ResumeV)->getParent()->getFirstNonPHI()); ResumeV = Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment + // to the resume value. The resume value is adjusted to the sentinel + // value when the final value from the main vector loop equals the start + // value. This ensures correctness when the start value might not be + // less than the minimum value of a monotonically increasing induction + // variable. 
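The adjustment described in the comment above reduces to one compare and one select on the resume value. A standalone sketch of the invariant it maintains (plain C++; the sentinel and start values are invented for the example, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

// FindLastIV reductions track the last index at which a condition held,
// starting from a sentinel meaning "nothing found yet". A main-loop result
// that still equals the start value must map back to the sentinel before
// the epilogue loop resumes.
int64_t adjustResumeValue(int64_t Resume, int64_t Start, int64_t Sentinel) {
  return Resume == Start ? Sentinel : Resume; // icmp eq + select
}

int main() {
  const int64_t Sentinel = INT64_MIN;
  const int64_t Start = 100; // need not be below the IV's minimum value
  // Main vector loop found no match: the epilogue resumes from the sentinel.
  assert(adjustResumeValue(Start, Start, Sentinel) == Sentinel);
  // A genuine match is passed through unchanged.
  assert(adjustResumeValue(250, Start, Sentinel) == 250);
  return 0;
}
```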
+ IRBuilder<> Builder( + cast(ResumeV)->getParent()->getFirstNonPHI()); + Value *Cmp = + Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); + ResumeV = + Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); } } else { // Retrieve the induction resume values for wide inductions from diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8a6fbd808de35..2742c3777c1ed 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2441,6 +2441,17 @@ class BoUpSLP { // operations or alternating sequences (e.g., +, -), we can safely // tell the inverse operations by checking commutativity. if (isa(VL[Lane])) { + if (auto *EI = dyn_cast(VL0)) { + if (OpIdx == 0) { + OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false}; + continue; + } + } else if (auto *EV = dyn_cast(VL0)) { + if (OpIdx == 0) { + OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false}; + continue; + } + } OpsVec[OpIdx][Lane] = { PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true, false}; @@ -8091,6 +8102,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, NonUniqueValueVL.append( PWSz - UniqueValues.size(), PoisonValue::get(UniqueValues.front()->getType())); + // Check that extended with poisons operations are still valid for + // vectorization (div/rem are not allowed). + if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) { + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return false; + } VL = NonUniqueValueVL; } return true; @@ -14935,8 +14953,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } ShuffleBuilder.add(*FrontTE, Mask); - Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors, - SubVectorsMask); + // Full matched entry found, no need to insert subvectors. 
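The comment above marks the case where one existing vectorized entry already covers every scalar of the node, so the result is a single shuffle of that entry by the common mask and the subvector arguments can be dropped. Conceptually (plain C++ stand-in with -1 playing the role of PoisonMaskElem, not part of the patch):

```cpp
#include <cassert>
#include <vector>

// A fully matched entry needs only one shuffle by the common mask; no
// subvectors have to be inserted afterwards.
std::vector<int> shuffleByMask(const std::vector<int> &Src,
                               const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    Out.push_back(M < 0 ? 0 /* poison lane, value unspecified */ : Src[M]);
  return Out;
}

int main() {
  std::vector<int> Entry = {4, 8, 15, 16}; // already-vectorized entry
  std::vector<int> Mask = {2, 0, -1, 1};   // reorders lanes, one poison
  std::vector<int> Res = shuffleByMask(Entry, Mask);
  assert(Res[0] == 15 && Res[1] == 4 && Res[3] == 8);
  return 0;
}
```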
+ Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {}); return Res; } if (!Resized) { @@ -17818,7 +17836,7 @@ bool BoUpSLP::collectValuesToDemote( }; if (E.isGather() || !Visited.insert(&E).second || any_of(E.Scalars, [&](Value *V) { - return all_of(V->users(), [&](User *U) { + return !isa(V) && all_of(V->users(), [&](User *U) { return isa(U) && !getTreeEntry(U); }); })) @@ -19431,38 +19449,23 @@ class HorizontalReduction { return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); } - case RecurKind::FMax: - return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); - case RecurKind::FMin: - return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); - case RecurKind::FMaximum: - return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS); - case RecurKind::FMinimum: - return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS); case RecurKind::SMax: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); case RecurKind::SMin: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); case RecurKind::UMax: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); case RecurKind::UMin: if (UseSelect) { - Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); + CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind); + Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name); return Builder.CreateSelect(Cmp, LHS, RHS, Name); } - return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS); + [[fallthrough]]; + case RecurKind::FMax: + case RecurKind::FMin: + case RecurKind::FMaximum: + case RecurKind::FMinimum: { + Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind); + return Builder.CreateBinaryIntrinsic(Id, LHS, RHS); + } default: llvm_unreachable("Unknown reduction operation."); } diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 5d4a3b555981c..cf653e2d3e658 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -21,8 +21,28 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class TargetLibraryInfo; +class TargetTransformInfo; struct HistogramInfo; +/// A chain of instructions that form a partial reduction. +/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))), +/// accumulator). +struct PartialReductionChain { + PartialReductionChain(Instruction *Reduction, Instruction *ExtendA, + Instruction *ExtendB, Instruction *BinOp) + : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp) { + } + /// The top-level binary operation that forms the reduction to a scalar + /// after the loop body. + Instruction *Reduction; + /// The extension of each of the inner binary operation's operands. + Instruction *ExtendA; + Instruction *ExtendB; + + /// The binary operation using the extends that is then reduced. + Instruction *BinOp; +}; + /// Helper class to create VPRecipies from IR instructions. class VPRecipeBuilder { /// The VPlan new recipes are added to. @@ -34,6 +54,9 @@ class VPRecipeBuilder { /// Target Library Info. 
const TargetLibraryInfo *TLI; + // Target Transform Info. + const TargetTransformInfo *TTI; + /// The legality analysis. LoopVectorizationLegality *Legal; @@ -63,6 +86,11 @@ class VPRecipeBuilder { /// created. SmallVector PhisToFix; + /// The set of reduction exit instructions that will be scaled to + /// a smaller VF via partial reductions, paired with the scaling factor. + DenseMap> + ScaledReductionExitInstrs; + /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -111,13 +139,35 @@ class VPRecipeBuilder { VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI, ArrayRef Operands); + /// Examines reduction operations to see if the target can use a cheaper + /// operation with a wider per-iteration input VF and narrower PHI VF. + /// Returns null if no scaled reduction was found, otherwise a pair with a + /// struct containing reduction information and the scaling factor between the + /// number of elements in the input and output. + std::optional> + getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx, + VFRange &Range); + public: VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder) - : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), - PSE(PSE), Builder(Builder) {} + : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), + CM(CM), PSE(PSE), Builder(Builder) {} + + std::optional> + getScaledReductionForInstr(const Instruction *ExitInst) { + auto It = ScaledReductionExitInstrs.find(ExitInst); + return It == ScaledReductionExitInstrs.end() + ? std::nullopt + : std::make_optional(It->second); + } + + /// Find all possible partial reductions in the loop and track all of those + /// that are valid so recipes can be formed later. + void collectScaledReductions(VFRange &Range); /// Create and return a widened recipe for \p I if one can be created within /// the given VF \p Range. @@ -125,6 +175,11 @@ class VPRecipeBuilder { ArrayRef Operands, VFRange &Range, VPBasicBlock *VPBB); + /// Create and return a partial reduction recipe for a reduction instruction + /// along with binary operation and reduction phi operands. + VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction, + ArrayRef Operands); + /// Set the recipe created for given ingredient. void setRecipe(Instruction *I, VPRecipeBase *R) { assert(!Ingredient2Recipe.contains(I) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cfbb4ad32d681..1da185f9cfdf4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -883,6 +883,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: case VPRecipeBase::VPScalarCastSC: + case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: @@ -2384,23 +2385,28 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// The phi is part of an ordered reduction. Requires IsInLoop to be true. bool IsOrdered; + /// When expanding the reduction PHI, the plan's VF element count is divided + /// by this factor to form the reduction phi's VF. 
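The field documented above divides the plan's VF to obtain the reduction PHI's VF. A minimal stand-in for the ElementCount arithmetic involved (plain C++; assumes llvm::ElementCount::divideCoefficientBy divides only the known-minimum coefficient, which matches its use here):

```cpp
#include <cassert>

// A minimal model of llvm::ElementCount: a known minimum lane count,
// optionally multiplied by the runtime vscale.
struct ElementCountModel {
  unsigned Min;
  bool Scalable;
  ElementCountModel divideCoefficientBy(unsigned F) const {
    return {Min / F, Scalable};
  }
};

int main() {
  // Plan VF <vscale x 16>, reduction PHI scaled down by a factor of 4:
  ElementCountModel PlanVF{16, true};
  ElementCountModel PhiVF = PlanVF.divideCoefficientBy(4);
  assert(PhiVF.Min == 4 && PhiVF.Scalable);
  // The default factor of 1 leaves the VF untouched.
  assert(PlanVF.divideCoefficientBy(1).Min == 16);
  return 0;
}
```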
+ unsigned VFScaleFactor = 1; + public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, - bool IsOrdered = false) + bool IsOrdered = false, unsigned VFScaleFactor = 1) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), - RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { + RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered), + VFScaleFactor(VFScaleFactor) { assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); } ~VPReductionPHIRecipe() override = default; VPReductionPHIRecipe *clone() override { - auto *R = - new VPReductionPHIRecipe(cast(getUnderlyingInstr()), RdxDesc, - *getOperand(0), IsInLoop, IsOrdered); + auto *R = new VPReductionPHIRecipe(cast(getUnderlyingInstr()), + RdxDesc, *getOperand(0), IsInLoop, + IsOrdered, VFScaleFactor); R->addOperand(getBackedgeValue()); return R; } @@ -2431,6 +2437,51 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, bool isInLoop() const { return IsInLoop; } }; +/// A recipe for forming partial reductions. In the loop, an accumulator and +/// vector operand are added together and passed to the next iteration as the +/// next accumulator. After the loop body, the accumulator is reduced to a +/// scalar value. +class VPPartialReductionRecipe : public VPSingleDefRecipe { + unsigned Opcode; + +public: + VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, + VPValue *Op1) + : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, + ReductionInst) {} + VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, + Instruction *ReductionInst = nullptr) + : VPSingleDefRecipe(VPDef::VPPartialReductionSC, + ArrayRef({Op0, Op1}), ReductionInst), + Opcode(Opcode) { + assert(isa(getOperand(1)->getDefiningRecipe()) && + "Unexpected operand order for partial reduction recipe"); + } + ~VPPartialReductionRecipe() override = default; + + VPPartialReductionRecipe *clone() override { + return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1)); + } + + VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) + + /// Generate the reduction in the loop. + void execute(VPTransformState &State) override; + + /// Return the cost of this VPPartialReductionRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + /// Get the binary op's opcode. + unsigned getOpcode() const { return Opcode; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPSingleDefRecipe { @@ -2640,7 +2691,7 @@ class VPReductionRecipe : public VPSingleDefRecipe { return R && classof(R); } - /// Generate the reduction in the loop + /// Generate the reduction in the loop. void execute(VPTransformState &State) override; /// Return the cost of VPReductionRecipe. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 35497a7431f76..8fea2c6fd33b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -231,10 +231,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const auto *R) { return R->getScalarType(); }) .Case( - [this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe, + VPPartialReductionRecipe>([this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) .Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e54df8bdeac55..4057a51155ece 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -277,6 +277,72 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF, llvm_unreachable("subclasses should implement computeCost"); } +InstructionCost +VPPartialReductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + std::optional Opcode = std::nullopt; + VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); + if (auto *WidenR = dyn_cast(BinOpR)) + Opcode = std::make_optional(WidenR->getOpcode()); + + VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe(); + VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe(); + + auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); + auto *InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) + : BinOpR->getOperand(0)); + auto *InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0) + : BinOpR->getOperand(1)); + + auto GetExtendKind = [](VPRecipeBase *R) { + // The extend could come from outside the plan. 
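The lambda that begins above classifies each widening as zero- or sign-extension because the two produce different wide values from the same narrow bits, and targets price the forms differently (sdot- versus udot-style instructions, for instance). A standalone illustration (plain C++; the enum constants mirror the PR_* names used in the diff, everything else is invented for the example):

```cpp
#include <cassert>
#include <cstdint>

enum ExtendKind { PR_None, PR_ZeroExtend, PR_SignExtend };

// The same i8 bit pattern widens to different i32 values depending on the
// extend kind, which is why the kind is part of the cost query.
int32_t widenByte(uint8_t Bits, ExtendKind K) {
  assert(K != PR_None && "no recognized extend to model");
  return K == PR_SignExtend ? int32_t(int8_t(Bits)) : int32_t(Bits);
}

int main() {
  assert(widenByte(0x80, PR_ZeroExtend) == 128);
  assert(widenByte(0x80, PR_SignExtend) == -128);
  return 0;
}
```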
+ if (!R) + return TargetTransformInfo::PR_None; + auto *WidenCastR = dyn_cast(R); + if (!WidenCastR) + return TargetTransformInfo::PR_None; + if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) + return TargetTransformInfo::PR_ZeroExtend; + if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) + return TargetTransformInfo::PR_SignExtend; + return TargetTransformInfo::PR_None; + }; + + return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, + PhiType, VF, GetExtendKind(ExtAR), + GetExtendKind(ExtBR), Opcode); +} + +void VPPartialReductionRecipe::execute(VPTransformState &State) { + State.setDebugLocFrom(getDebugLoc()); + auto &Builder = State.Builder; + + assert(getOpcode() == Instruction::Add && + "Unhandled partial reduction opcode"); + + Value *BinOpVal = State.get(getOperand(0)); + Value *PhiVal = State.get(getOperand(1)); + assert(PhiVal && BinOpVal && "Phi and Mul must be set"); + + Type *RetTy = PhiVal->getType(); + + CallInst *V = Builder.CreateIntrinsic( + RetTy, Intrinsic::experimental_vector_partial_reduce_add, + {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + + State.set(this, V); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PARTIAL-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; + printOperands(O, SlotTracker); +} +#endif + FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -3356,6 +3422,10 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, void VPReductionPHIRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; + // If this phi is fed by a scaled reduction then it should output a + // vector with fewer elements than the VF. + ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); + // Reductions do not have to start at zero. They can start with // any loop invariant values. VPValue *StartVPV = getStartValue(); @@ -3366,8 +3436,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = ScalarPHI ? StartV->getType() - : VectorType::get(StartV->getType(), State.VF); + Type *VecTy = + ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF); BasicBlock *HeaderBB = State.CFG.PrevBB; assert(State.CurrentParentLoop->getHeader() == HeaderBB && @@ -3417,13 +3487,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // Create start and identity vector values for the reduction in the // preheader. // TODO: Introduce recipes in VPlan preheader to create initial values. 
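The splat-plus-insert sequence following the TODO above builds the reduction's preheader value: an identity splat with the start value placed in lane 0, so the start value is counted exactly once by the final horizontal reduction. A standalone sketch (plain C++, an add reduction with identity 0 assumed, not part of the patch):

```cpp
#include <cassert>
#include <vector>

// Preheader value for an unordered vector reduction: splat the identity,
// then insert the start value into lane 0.
std::vector<int> buildStartVector(int Identity, int Start, unsigned VF) {
  std::vector<int> V(VF, Identity); // CreateVectorSplat(VF, Iden)
  V[0] = Start;                     // CreateInsertElement(Iden, StartV, 0)
  return V;
}

int main() {
  // Add reduction: identity 0, start value 7, VF 4.
  std::vector<int> V = buildStartVector(0, 7, 4);
  int Sum = 0;
  for (int X : V)
    Sum += X;
  assert(Sum == 7); // the start value contributes exactly once
  return 0;
}
```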
- Iden = Builder.CreateVectorSplat(State.VF, Iden); + Iden = Builder.CreateVectorSplat(VF, Iden); IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); Constant *Zero = Builder.getInt32(0); StartV = Builder.CreateInsertElement(Iden, StartV, Zero); } else { - Iden = Builder.CreateVectorSplat(State.VF, Iden); + Iden = Builder.CreateVectorSplat(VF, Iden); } } } @@ -3441,6 +3511,8 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); + if (VFScaleFactor != 1) + O << " (VF scaled by 1/" << VFScaleFactor << ")"; } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 957a602091c73..7aaf4002b8b3e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -329,6 +329,7 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, + VPPartialReductionSC, VPReplicateSC, VPScalarCastSC, VPScalarIVStepsSC, diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll index 0bdcc35790148..e855578e794fa 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll @@ -194,3 +194,79 @@ loop: exit: ret void } + +; In the following test, the sink is loop-invariant. + +define void @type_size_equivalence_sink_loopinv(ptr nocapture %vec, i64 %n) { +; CHECK-LABEL: 'type_size_equivalence_sink_loopinv' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.n = getelementptr inbounds i64, ptr %vec, i64 %n + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + + %gep.iv = getelementptr i64, ptr %vec, i64 %iv + %ld.i64 = load i64, ptr %gep.iv, align 8 + + %ld.i64.i32 = trunc i64 %ld.i64 to i32 + store i32 %ld.i64.i32, ptr %gep.n, align 8 + + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp eq i64 %iv.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret void +} + +; Variant of the above, with a negative induction step and a gep exposing a +; type-mismatch. + +define void @type_size_equivalence_sink_loopinv_negind(ptr nocapture %vec, i64 %n) { +; CHECK-LABEL: 'type_size_equivalence_sink_loopinv_negind' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %minus.n = sub nsw i64 0, %n + %gep.minus.n = getelementptr inbounds i64, ptr %vec, i64 %minus.n + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + + %minus.iv = sub nsw i64 0, %iv + %gep.minus.iv = getelementptr i64, ptr %vec, i64 %minus.iv + %gep.minus.iv.4 = getelementptr i8, ptr %gep.minus.iv, i64 -4 + %ld.i64 = load i64, ptr %gep.minus.iv.4, align 8 + + %ld.i64.i32 = trunc i64 %ld.i64 to i32 + store i32 %ld.i64.i32, ptr %gep.minus.n, align 8 + + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp eq i64 %iv.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll index 35cfadaa2965a..0e6db403512ae 100644 --- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll +++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll @@ -126,6 +126,19 @@ define i1 @sgt_implies_ge_via_assume(i32 %i, i32 %j) { ret i1 %i.ge.j } +define i1 @sgt_implies_false_le_via_assume(i32 %i, i32 %j) { +; CHECK-LABEL: define i1 @sgt_implies_false_le_via_assume( +; CHECK-SAME: i32 [[I:%.*]], i32 [[J:%.*]]) { +; CHECK-NEXT: [[I_SGT_J:%.*]] = icmp sgt i32 [[I]], [[J]] +; CHECK-NEXT: call void @llvm.assume(i1 [[I_SGT_J]]) +; CHECK-NEXT: ret i1 false +; + %i.sgt.j = icmp sgt i32 %i, %j + call void @llvm.assume(i1 %i.sgt.j) + %i.le.j = icmp samesign ule i32 %i, %j + ret i1 %i.le.j +} + define i32 @gt_implies_sge_dominating(i32 %a, i32 %len) { ; CHECK-LABEL: define i32 @gt_implies_sge_dominating( ; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { @@ -150,6 +163,30 @@ end: ret i32 -1 } +define i32 @gt_implies_false_sle_dominating(i32 %a, i32 %len) { +; CHECK-LABEL: define i32 @gt_implies_false_sle_dominating( +; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A_GT_LEN:%.*]] = icmp samesign ugt i32 [[A]], [[LEN]] +; CHECK-NEXT: br i1 [[A_GT_LEN]], label %[[TAKEN:.*]], label %[[END:.*]] +; CHECK: [[TAKEN]]: +; CHECK-NEXT: ret i32 0 +; CHECK: [[END]]: +; CHECK-NEXT: ret i32 -1 +; +entry: + %a.gt.len = icmp samesign ugt i32 %a, %len + br i1 %a.gt.len, label %taken, label %end + +taken: + %a.sle.len = icmp sle i32 %a, %len + %res = select i1 %a.sle.len, i32 30, i32 0 + ret i32 %res + +end: + ret i32 -1 +} + define i32 @gt_implies_sge_dominating_cr(i32 %a, i32 %len) { ; CHECK-LABEL: define i32 @gt_implies_sge_dominating_cr( ; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { diff --git a/llvm/test/Assembler/captures-errors.ll b/llvm/test/Assembler/captures-errors.ll new file mode 100644 index 0000000000000..44788c79a2453 --- /dev/null +++ b/llvm/test/Assembler/captures-errors.ll @@ -0,0 +1,73 @@ +; RUN: split-file --leading-lines %s %t +; RUN: not llvm-as < %t/missing-lparen.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-LPAREN +; RUN: not llvm-as < %t/missing-rparen.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-RPAREN +; RUN: not llvm-as < %t/missing-rparen-none.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-RPAREN-NONE +; RUN: not llvm-as < %t/missing-colon.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-COLON +; RUN: not llvm-as < %t/invalid-component.ll 2>&1 | FileCheck %s --check-prefix=CHECK-INVALID-COMPONENT +; RUN: not llvm-as < %t/duplicate-ret.ll 2>&1 | FileCheck %s --check-prefix=CHECK-DUPLICATE-RET +; RUN: not llvm-as < %t/none-after.ll 2>&1 | FileCheck %s 
--check-prefix=CHECK-NONE-AFTER +; RUN: not llvm-as < %t/none-before.ll 2>&1 | FileCheck %s --check-prefix=CHECK-NONE-BEFORE +; RUN: not opt -disable-output < %t/non-pointer-type.ll 2>&1 | FileCheck %s --check-prefix=CHECK-NON-POINTER-TYPE + +;--- missing-lparen.ll + +; CHECK-MISSING-LPAREN: :[[@LINE+1]]:32: error: expected '(' +define void @test(ptr captures %p) { + ret void +} + +;--- missing-rparen.ll + +; CHECK-MISSING-RPAREN: :[[@LINE+1]]:40: error: expected ',' or ')' +define void @test(ptr captures(address %p) { + ret void +} + +;--- missing-rparen-none.ll + +; CHECK-MISSING-RPAREN-NONE: :[[@LINE+1]]:37: error: expected ',' or ')' +define void @test(ptr captures(none %p) { + ret void +} + +;--- missing-colon.ll + +; CHECK-MISSING-COLON: :[[@LINE+1]]:36: error: expected ':' +define void @test(ptr captures(ret address) %p) { + ret void +} + +;--- invalid-component.ll + +; CHECK-INVALID-COMPONENT: :[[@LINE+1]]:32: error: expected one of 'none', 'address', 'address_is_null', 'provenance' or 'read_provenance' +define void @test(ptr captures(foo) %p) { + ret void +} + +;--- duplicate-ret.ll + +; CHECK-DUPLICATE-RET: :[[@LINE+1]]:51: error: duplicate 'ret' location +define void @test(ptr captures(ret: address, ret: provenance) %p) { + ret void +} + +;--- none-after.ll + +; CHECK-NONE-AFTER: :[[@LINE+1]]:45: error: cannot use 'none' with other component +define void @test(ptr captures(address, none) %p) { + ret void +} + +;--- none-before.ll + +; CHECK-NONE-BEFORE: :[[@LINE+1]]:38: error: cannot use 'none' with other component +define void @test(ptr captures(none, address) %p) { + ret void +} + +;--- non-pointer-type.ll + +; CHECK-NON-POINTER-TYPE: Attribute 'captures(none)' applied to incompatible type! +define void @test(i32 captures(none) %p) { + ret void +} diff --git a/llvm/test/Assembler/captures.ll b/llvm/test/Assembler/captures.ll new file mode 100644 index 0000000000000..1521a9df0cb42 --- /dev/null +++ b/llvm/test/Assembler/captures.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S < %s | FileCheck %s +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +define void @test_none(ptr captures(none) %p) { +; CHECK-LABEL: define void @test_none( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address(ptr captures(address) %p) { +; CHECK-LABEL: define void @test_address( +; CHECK-SAME: ptr captures(address) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_is_null(ptr captures(address_is_null) %p) { +; CHECK-LABEL: define void @test_address_is_null( +; CHECK-SAME: ptr captures(address_is_null) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_provenance(ptr captures(address, provenance) %p) { +; CHECK-LABEL: define void @test_address_provenance( +; CHECK-SAME: ptr captures(address, provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_read_provenance(ptr captures(address, read_provenance) %p) { +; CHECK-LABEL: define void @test_address_read_provenance( +; CHECK-SAME: ptr captures(address, read_provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_ret(ptr captures(ret: address, provenance) %p) { +; CHECK-LABEL: define void @test_ret( +; CHECK-SAME: ptr captures(ret: address, provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_is_null_and_ret(ptr captures(address_is_null, 
ret: address, provenance) %p) { +; CHECK-LABEL: define void @test_address_is_null_and_ret( +; CHECK-SAME: ptr captures(address_is_null, ret: address, provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_address_and_ret_none(ptr captures(address, ret: none) %p) { +; CHECK-LABEL: define void @test_address_and_ret_none( +; CHECK-SAME: ptr captures(address, ret: none) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +; Duplicates collapse into one. +define void @test_duplicate(ptr captures(address, address) %p) { +; CHECK-LABEL: define void @test_duplicate( +; CHECK-SAME: ptr captures(address) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +; read_provenance is a subset of provenance. +define void @test_duplicate_read_provenance(ptr captures(read_provenance, provenance) %p) { +; CHECK-LABEL: define void @test_duplicate_read_provenance( +; CHECK-SAME: ptr captures(provenance) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +; address_is_null is a subset of address. +define void @test_duplicate_address_is_null(ptr captures(address_is_null, address) %p) { +; CHECK-LABEL: define void @test_duplicate_address_is_null( +; CHECK-SAME: ptr captures(address) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +; Return-only none is the same as plain none. +define void @test_ret_none(ptr captures(ret: none) %p) { +; CHECK-LABEL: define void @test_ret_none( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index 492de663884df..1da9291c71996 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -562,6 +562,11 @@ define void @initializes(ptr initializes((-4, 0), (4, 8)) %a) { ret void } +; CHECK: define void @captures(ptr captures(address) %p) +define void @captures(ptr captures(address) %p) { + ret void +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { memory(none) } diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll index 0cead19a74bfd..478404dcd50aa 100644 --- a/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll +++ b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve2,+fp8,+fp8dot2,+fp8dot4 < %s | FileCheck %s -; RUN: llc -mattr=+sme,+fp8,+ssve-fp8dot2,+ssve-fp8dot4 --force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve2,+fp8dot2,+fp8dot4 < %s | FileCheck %s +; RUN: llc -mattr=+sme,+ssve-fp8dot2,+ssve-fp8dot4 --force-streaming < %s | FileCheck %s target triple = "aarch64-linux" diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 97522b9a319c0..1f68815411097 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -82,3 +82,121 @@ define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "a %res = call i64 @agnostic_decl(i64 %v) ret i64 %res } + +; agnostic-ZA + streaming -> private-ZA + non-streaming +define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-112]!
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: mov x0, x9 +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @private_za_decl(i64 %v) + %res2 = call i64 @private_za_decl(i64 %res) + ret i64 %res2 +} + +; agnostic-ZA + streaming-compatible -> private-ZA + non-streaming +define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: mov x0, x9 +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz w20, #0, .LBB5_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: tbz w20, #0, .LBB5_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz w20, #0, .LBB5_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB5_6: +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: tbz w20, #0, .LBB5_8 +; CHECK-NEXT: // %bb.7: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB5_8: +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @private_za_decl(i64 %v) + %res2 = call i64 @private_za_decl(i64 %res) + ret i64 %res2 +} diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll deleted file mode 100644 index 81d6d6369dcbf..0000000000000 --- a/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s - -define dso_local void @normal_call() local_unnamed_addr { -entry: - call void @a() - ret void -} -; CHECK-LABEL: normal_call: -; CHECK: bl a - -declare void @a() local_unnamed_addr - -; Even if there are no calls to imported functions, we still need to emit the -; .impcall section. 
- -; CHECK-LABEL .section .impcall,"yi" -; CHECK-NEXT .asciz "Imp_Call_V1" -; CHECK-NOT .secnum diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll deleted file mode 100644 index 6bb118ba1e159..0000000000000 --- a/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s --check-prefix=CHECK-ENABLED -; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-DISABLED - -; CHECK-DISABLED-NOT: .section .impcall - -define dso_local void @normal_call() local_unnamed_addr section "nc_sect" { -entry: - call void @a() - call void @a() - ret void -} -; CHECK-ENABLED-LABEL: normal_call: -; CHECK-ENABLED: adrp [[ADRPREG:x[0-9]+]], __imp_a -; CHECK-ENABLED-NEXT: ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_a] -; CHECK-ENABLED-NEXT: .Limpcall0: -; CHECK-ENABLED-NEXT: blr [[LDRREG]] -; CHECK-ENABLED-NEXT: .Limpcall1: -; CHECK-ENABLED-NEXT: blr [[LDRREG]] - -define dso_local void @tail_call() local_unnamed_addr section "tc_sect" { -entry: - tail call void @b() - ret void -} -; CHECK-ENABLED-LABEL: tail_call: -; CHECK-ENABLED: adrp [[ADRPREG:x[0-9]+]], __imp_b -; CHECK-ENABLED-NEXT: ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_b] -; CHECK-ENABLED-NEXT: .Limpcall2: -; CHECK-ENABLED-NEXT: br [[LDRREG]] - -declare dllimport void @a() local_unnamed_addr -declare dllimport void @b() local_unnamed_addr - -; CHECK-ENABLED-LABEL .section .impcall,"yi" -; CHECK-ENABLED-NEXT .asciz "Imp_Call_V1" -; CHECK-ENABLED-NEXT .word 32 -; CHECK-ENABLED-NEXT .secnum nc_sect -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall0 -; CHECK-ENABLED-NEXT .symidx __imp_a -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall1 -; CHECK-ENABLED-NEXT .symidx __imp_a -; CHECK-ENABLED-NEXT .word 20 -; CHECK-ENABLED-NEXT .secnum tc_sect -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall2 -; CHECK-ENABLED-NEXT .symidx __imp_b diff --git a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll index 9391b50c04a5f..d7c8e47f98883 100644 --- a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs ; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s +; RUN: opt -data-layout=A5 -mtriple=amdgcn -passes=amdgpu-lower-enqueued-block -S < %s | FileCheck %s %struct.ndrange_t = type { i32 } %opencl.queue_t = type opaque diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll new file mode 100644 index 0000000000000..621187100f323 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + +define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) { +; CHECK-LABEL: load_idx_idy: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s4, s6, 
16 +; CHECK-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 6 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:4 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_endpgm +entry: + %disp1 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + %gep_y = getelementptr i8, ptr addrspace(4) %disp1, i64 6 + %L = load i1, ptr addrspace(4) %gep_y, align 1 + %idxprom = sext i1 %L to i64 + %gep0 = getelementptr <32 x i16>, ptr addrspace(4) %disp, i64 %idxprom + %gep1 = getelementptr i8, ptr addrspace(4) %gep0, i64 4 + %L1 = load i8, ptr addrspace(4) %gep1 + store i8 %L1, ptr %g + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef nonnull align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 84a3a3e88d238..32d8aa18d9713 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -160,14 +160,9 @@ define amdgpu_kernel void @ceil_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 9909cfd32b11f..f6a9fadb33865 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -161,14 +161,9 @@ define amdgpu_kernel void @floor_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 53c26cadbf75a..ff1c3da1d5fe5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -480,9 +480,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32: @@ -610,9 +609,7 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16: @@ -737,15 +734,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32: @@ -891,12 +886,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16: @@ -1036,24 +1028,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX11-SDAG-TRUE16: ; %bb.0: ; 
GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000 -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v6.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32: @@ -1238,20 +1227,14 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v6.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll 
b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll index e0b694ee58f0e..0359bb7183974 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll @@ -4,11 +4,19 @@ ; RUN: FileCheck --check-prefix=WARN-GFX7 %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=bonaire -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX7,IR %s +; RUN: FileCheck --check-prefix=WARN-GFX7 %s < %t + ; RUN: llc -mtriple=amdgcn -mcpu=fiji -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX8,IR %s ; RUN: FileCheck --check-prefix=WARN-GFX8 %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=fiji -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX8,IR %s +; RUN: FileCheck --check-prefix=WARN-GFX8 %s < %t + ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX9,GFX906,IR %s ; RUN: FileCheck --check-prefix=WARN-GFX906 %s < %t diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll index 32fed3ba22c59..676ba1480e6d2 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll @@ -4,11 +4,21 @@ ; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=COMPATIBLE,REALTIME,MEMTIME %s +; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s + ; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s ; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s +; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s + ; Note: This test checks the IR, but also has a run line to codegen the file just to check we ; do not crash when trying to select those functions. 
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll index 406c953a06d97..75a388eb1229b 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll @@ -12,10 +12,18 @@ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s + ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s + ; WARN-GFX906: removing function 'needs_wavefrontsize32': +wavefrontsize32 is not supported on the current target ; WARN-GFX906-NOT: not supported diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index e8f86a6ce63ff..e14666cdac5c2 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,GFX9 ; XXX - Why the packing?
define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { @@ -43,6 +44,27 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm %tmp1 = load i32, ptr addrspace(1) %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -90,6 +112,27 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm %tmp1 = load float, ptr addrspace(1) %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -130,6 +173,23 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() { ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v4i16: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_lshl_b32 s1, s0, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm bb: %tmp = load <2 x i8>, ptr addrspace(1) undef, align 1 %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> @@ -176,6 +236,28 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() { ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v4f16: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_lshl_b32 s1, s0, 8 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s1, s0, 0xff00 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NEXT: s_or_b32 s1, s4, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s4, s1, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s1 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm bb: %load = load half, ptr addrspace(1) undef, align 1 %tmp = bitcast half %load to <2 x i8> @@ -235,18 +317,48 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: scalar_to_vector_test6: -; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: s_endpgm +; GFX89-LABEL: scalar_to_vector_test6: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 %bc = bitcast <4 x i8> %newvec0 to <2 x half> store <2 x half> %bc, ptr addrspace(1) %out ret void } + +; bitcast (scalar_to_vector x) -> any_extend x +define i64 @bitcast_combine_scalar_to_vector_v4i16(i16 %arg) { +; SI-LABEL: bitcast_combine_scalar_to_vector_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; SI-NEXT: v_or_b32_e32 v2, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: bitcast_combine_scalar_to_vector_v4i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_and_b32_e32 v1, 0xffffff00, v0 +; GFX89-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX89-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX89-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: s_setpc_b64 s[30:31] + %arg.cast = bitcast i16 %arg to <2 x i8> + %tmp1 = shufflevector <2 x i8> %arg.cast, <2 x i8> poison, <8 x i32> + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> poison, <8 x i32> + %cast = bitcast <8 x i8> %tmp2 to i64 + ret i64 %cast +} diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 245df6684384c..94b22b79f6632 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -237,14 +237,9 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -338,17 +333,13 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index bc1b102d33de1..2a2fd93bc2d0b 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -237,14 +237,9 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -338,17 +333,13 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; 
GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 2eba67b06bae1..072151dd6f5a0 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-FAKE16 %s +; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-REAL16 %s +; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -38,6 +42,89 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32_v2f16: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; 
GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32_v2f16: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -87,6 +174,89 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_subrev_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_subrev_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -136,6 +306,78 @@ define amdgpu_kernel void @fptrunc( ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: fptrunc: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-GCN-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-GCN-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GCN-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: fptrunc: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s6, -1 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s10, s6 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s11, s7 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s8, s2 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s9, s3 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s4, s0 +; GFX11-GCN-REAL16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s5, s1 +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-GCN-REAL16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: fptrunc: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, s3 +; 
GFX11-GISEL-REAL16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-REAL16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-REAL16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { %a.val = load <2 x float>, ptr addrspace(1) %a @@ -178,6 +420,89 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32.fabs: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, |v1|, |v0| +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32.fabs: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, |v1|, |v0| +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32.fabs: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; 
GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fabs: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -229,6 +554,89 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32.fneg: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, -v1, -v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32.fneg: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, -v1, -v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32.fneg: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fneg: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext diff --git a/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll b/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll new file mode 100644 index 0000000000000..1916cdf374455 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll @@ -0,0 +1,126 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s +; Before SM6.2 ByteAddressBuffer and StructuredBuffer lower to bufferStore. 
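;
; A minimal sketch of the lowering contrast this file exercises, assembled
; from the CHECK lines here and in RawBufferStore.ll further down; %h and %v
; are placeholder names, not part of the test:
;
;   ; SM6.1 and earlier: dx.op.bufferStore, opcode 69, no alignment operand.
;   call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %h,
;       i32 %index, i32 0, float %v, float undef, float undef, float undef, i8 1)
;
;   ; SM6.2 and later: dx.op.rawBufferStore, opcode 140, with a trailing
;   ; i32 alignment operand.
;   call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %h,
;       i32 %index, i32 0, float %v, float undef, float undef, float undef, i8 1, i32 4)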
+ +target triple = "dxil-pc-shadermodel6.1-compute" + +; CHECK-LABEL: define void @storef32_struct +define void @storef32_struct(i32 %index, float %data) { + %buffer = call target("dx.RawBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float %data, float undef, float undef, float undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storef32_byte +define void @storef32_byte(i32 %offset, float %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, float %data, float undef, float undef, float undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_struct +define void @storev4f32_struct(i32 %index, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_byte +define void @storev4f32_byte(i32 %offset, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storeelements +define void @storeelements(i32 %index, <4 x float> %data0, <4 x i32> %data1) { + %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data0, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data0, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data0, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data0, i32 3 + ; CHECK: call 
void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x i32> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x i32> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x i32> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x i32> %data1, i32 3 + ; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 16, i32 [[DATA1_0]], i32 [[DATA1_1]], i32 [[DATA1_2]], i32 [[DATA1_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ret void +} + +; CHECK-LABEL: define void @storenested +define void @storenested(i32 %index, i32 %data0, <4 x float> %data1, <3 x half> %data2) { + %buffer = call + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i32 %data0, i32 undef, i32 undef, i32 undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 0, i32 %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x float> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x float> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x float> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x float> %data1, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 4, float [[DATA1_0]], float [[DATA1_1]], float [[DATA1_2]], float [[DATA1_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 4, <4 x float> %data1) + + ; CHECK: [[DATA2_0:%.*]] = extractelement <3 x half> %data2, i32 0 + ; CHECK: [[DATA2_1:%.*]] = extractelement <3 x half> %data2, i32 1 + ; CHECK: [[DATA2_2:%.*]] = extractelement <3 x half> %data2, i32 2 + ; CHECK: call void @dx.op.bufferStore.f16(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 20, half [[DATA2_0]], half [[DATA2_1]], half [[DATA2_2]], half undef, i8 7) + call void @llvm.dx.resource.store.rawbuffer.v3f16( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 20, <3 x half> %data2) + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll b/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll new file mode 100644 index 0000000000000..6a5274429930e --- /dev/null +++ b/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll @@ -0,0 +1,98 @@ +; RUN: opt -S -dxil-op-lower -dxil-translate-metadata -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; This test makes sure LLVM metadata is being translated into DXIL.
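;
; A minimal sketch of the translation this file checks, assembled from the
; test bodies and CHECK lines below (i32 1 requests a branch, i32 2 requests
; flattening, matching @test_branch and @test_flatten); metadata numbers are
; placeholders:
;
;   ; Input IR:
;   br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0
;   !0 = !{!"hlsl.controlflow.hint", i32 1}
;
;   ; After -dxil-translate-metadata:
;   br i1 %cmp, label %if.then, label %if.else, !dx.controlflow.hints !0
;   !0 = !{!"dx.controlflow.hints", i32 1}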
+ + +; CHECK: define i32 @test_branch(i32 %X) +; CHECK-NOT: hlsl.controlflow.hint +; CHECK: br i1 %cmp, label %if.then, label %if.else, !dx.controlflow.hints [[HINT_BRANCH:![0-9]+]] +define i32 @test_branch(i32 %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + + +; CHECK: define i32 @test_flatten(i32 %X) +; CHECK-NOT: hlsl.controlflow.hint +; CHECK: br i1 %cmp, label %if.then, label %if.else, !dx.controlflow.hints [[HINT_FLATTEN:![0-9]+]] +define i32 @test_flatten(i32 %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + + +; CHECK: define i32 @test_no_attr(i32 %X) +; CHECK-NOT: hlsl.controlflow.hint +; CHECK-NOT: !dx.controlflow.hints +define i32 @test_no_attr(i32 %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} +; CHECK-NOT: hlsl.controlflow.hint +; CHECK: [[HINT_BRANCH]] = !{!"dx.controlflow.hints", i32 1} +; CHECK: [[HINT_FLATTEN]] = !{!"dx.controlflow.hints", i32 2} +!0 = !{!"hlsl.controlflow.hint", i32 1} +!1 = !{!"hlsl.controlflow.hint", i32 2} diff --git a/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll b/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll new file mode 100644 index 0000000000000..a883a0bbc29fd --- /dev/null +++ b/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll @@ -0,0 +1,20 @@ +; We use llc for this test so that we don't abort after the first error. 
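;
; The RUN line below follows the usual diagnostic-test idiom: `not` inverts
; llc's failing exit status so the test passes exactly when the error is
; emitted, 2>&1 folds stderr into the pipe, and FileCheck then matches the
; diagnostic text. CHECK-SAME continues matching on the same diagnostic line,
; which lets one long message be checked in readable pieces:
;
;   not llc %s -o /dev/null 2>&1 | FileCheck %s
;   ; CHECK: error:
;   ; CHECK-SAME: in function storev4f64_byte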
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s + +target triple = "dxil-pc-shadermodel6.2-compute" + +; Can't store 64 bit types directly until SM6.3 (byteaddressbuf.Store) +; CHECK: error: +; CHECK-SAME: in function storev4f64_byte +; CHECK-SAME: Cannot create RawBufferStore operation: Invalid overload type +define void @storev4f64_byte(i32 %offset, <4 x double> %data) "hlsl.export" { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + call void @llvm.dx.resource.store.rawbuffer.v4i64( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x double> %data) + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/RawBufferStore.ll b/llvm/test/CodeGen/DirectX/RawBufferStore.ll new file mode 100644 index 0000000000000..96824d5ee5a4a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/RawBufferStore.ll @@ -0,0 +1,144 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +; CHECK-LABEL: define void @storef32_struct +define void @storef32_struct(i32 %index, float %data) { + %buffer = call target("dx.RawBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float %data, float undef, float undef, float undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storef32_byte +define void @storef32_byte(i32 %offset, float %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, float %data, float undef, float undef, float undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_struct +define void @storev4f32_struct(i32 %index, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_byte +define void @storev4f32_byte(i32 %offset, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = 
extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storeelements +define void @storeelements(i32 %index, <4 x float> %data0, <4 x i32> %data1) { + %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data0, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data0, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data0, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data0, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x i32> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x i32> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x i32> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x i32> %data1, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 16, i32 [[DATA1_0]], i32 [[DATA1_1]], i32 [[DATA1_2]], i32 [[DATA1_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ret void +} + +; CHECK-LABEL: define void @storenested +define void @storenested(i32 %index, i32 %data0, <4 x float> %data1, <3 x half> %data2) { + %buffer = call + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, i32 %data0, i32 undef, i32 undef, i32 undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 0, i32 %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x float> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x float> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x float> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x float> %data1, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 4, float [[DATA1_0]], float [[DATA1_1]], float [[DATA1_2]], float [[DATA1_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 4, <4 x float> %data1) + + ; CHECK: [[DATA2_0:%.*]] = extractelement <3 x half> %data2, i32 0 + ; CHECK: [[DATA2_1:%.*]] = extractelement <3 x half> %data2, i32 1 + ; CHECK: [[DATA2_2:%.*]] = extractelement <3 x half> %data2, i32 2 + ; CHECK: call void 
@dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 20, half [[DATA2_0]], half [[DATA2_1]], half [[DATA2_2]], half undef, i8 7, i32 2) + call void @llvm.dx.resource.store.rawbuffer.v3f16( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 20, <3 x half> %data2) + + ret void +} + +; byteaddressbuf.Store +; CHECK-LABEL: define void @storev4f64_byte +define void @storev4f64_byte(i32 %offset, <4 x double> %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x double> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x double> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x double> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x double> %data, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, double [[DATA0]], double [[DATA1]], double [[DATA2]], double [[DATA3]], i8 15, i32 8) + call void @llvm.dx.resource.store.rawbuffer.v4i64( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x double> %data) + + ret void +} diff --git a/llvm/test/CodeGen/MIR/AArch64/called-globals.mir b/llvm/test/CodeGen/MIR/AArch64/called-globals.mir deleted file mode 100644 index cf0f0a23e2d91..0000000000000 --- a/llvm/test/CodeGen/MIR/AArch64/called-globals.mir +++ /dev/null @@ -1,61 +0,0 @@ -# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s - ---- | - declare dllimport void @callee_func() local_unnamed_addr - - define dso_local void @caller() local_unnamed_addr { - entry: - call void @callee_func() - call void @callee_func() - ret void - } -... ---- -name: caller -stack: - - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '$x19', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -calledGlobals: - - bb: 0 - offset: 7 - callee: callee_func - flags: 144 - - bb: 0 - offset: 8 - callee: callee_func - flags: 144 -body: | - bb.0.entry: - liveins: $x19, $lr - - early-clobber $sp = frame-setup STRXpre killed $x19, $sp, -16 :: (store (s64) into %stack.1) - frame-setup SEH_SaveReg_X 19, -16 - frame-setup STRXui killed $lr, $sp, 1 :: (store (s64) into %stack.0) - frame-setup SEH_SaveReg 30, 8 - frame-setup SEH_PrologEnd - $x19 = ADRP target-flags(aarch64-page, aarch64-got, aarch64-dllimport) @callee_func - renamable $x19 = LDRXui killed $x19, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc, aarch64-dllimport) @callee_func - BLR renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - BLR killed renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - frame-destroy SEH_EpilogStart - $lr = frame-destroy LDRXui $sp, 1 :: (load (s64) from %stack.0) - frame-destroy SEH_SaveReg 30, 8 - early-clobber $sp, $x19 = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) - frame-destroy SEH_SaveReg_X 19, -16 - frame-destroy SEH_EpilogEnd - RET undef $lr -... 
- -# CHECK-LABEL: calledGlobals: -# CHECK-NEXT: - bb: 0 -# CHECK-NEXT: offset: 7 -# CHECK-NEXT: callee: callee_func -# CHECK-NEXT: flags: 144 -# CHECK-NEXT: - bb: 0 -# CHECK-NEXT: offset: 8 -# CHECK-NEXT: callee: callee_func -# CHECK-NEXT: flags: 144 diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir index e4dab779216a8..096a80f77dbb6 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa instruction block out of range. Unable to reference bb:1 +# CHECK: baa call instruction block out of range. Unable to reference bb:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir index 183610b326eeb..bd5b2451a8d76 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa instruction offset out of range. Unable to reference instruction at bb: 0 at offset:1 +# CHECK: baa call instruction offset out of range. Unable to reference instruction at bb: 0 at offset:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/NVPTX/convert-sm89.ll b/llvm/test/CodeGen/NVPTX/convert-sm89.ll index 5d0576aebbe08..30fd76f5a31c2 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm89.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm89.ll @@ -84,3 +84,10 @@ define <2 x half> @cvt_rn_relu_f16x2_e5m2x2(i16 %in) { %val = call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 %in); ret <2 x half> %val } + +; CHECK-LABEL: cvt_rna_satfinite_tf32_f32 +define i32 @cvt_rna_satfinite_tf32_f32(float %f1) { +; CHECK: cvt.rna.satfinite.tf32.f32 + %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1) + ret i32 %val +} diff --git a/llvm/test/CodeGen/NVPTX/convert-sm90.ll b/llvm/test/CodeGen/NVPTX/convert-sm90.ll new file mode 100644 index 0000000000000..5f610e0e91f88 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm90.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} + +declare i32 @llvm.nvvm.f2tf32.rn(float %f1) +declare i32 @llvm.nvvm.f2tf32.rn.relu(float %f1) +declare i32 @llvm.nvvm.f2tf32.rz(float %f1) +declare i32 @llvm.nvvm.f2tf32.rz.relu(float %f1) + +define i32 @cvt_rn_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rn_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rn.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rn(float %f1) + ret i32 %val +} + +define i32 @cvt_rn_relu_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rn_relu_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_tf32_f32_param_0]; +; 
CHECK-NEXT: cvt.rn.relu.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rn.relu(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_relu_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_relu_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.relu.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz.relu(float %f1) + ret i32 %val +} diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index c0fcc6f611111..a09261609d844 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -87,6 +87,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm %s -o - | FileCheck --check-prefix=RV32XQCICM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciint %s -o - | FileCheck --check-prefix=RV32XQCIINT %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s @@ -401,6 +402,7 @@ ; RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2" ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2" ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2" +; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p2" ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2" ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2" ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0" diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 1ab3722080f70..447fc26b0106e 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -7,406 +7,676 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST ; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST -%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } -@src = external dso_local global %struct.x -@dst = external dso_local global %struct.x +; ---------------------------------------------------------------------- +; Fully unaligned cases -@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 -@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 -@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 -@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 -@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 -@.str6 = private unnamed_addr 
constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 -@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 +define void @unaligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 false) + ret void +} -define i32 @t0() { -; RV32-LABEL: t0: +define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) + ret void +} + +define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy2: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a0, %hi(src) -; RV32-NEXT: lw a1, %lo(src)(a0) -; RV32-NEXT: lui a2, %hi(dst) -; RV32-NEXT: addi a0, a0, %lo(src) -; RV32-NEXT: sw a1, %lo(dst)(a2) -; RV32-NEXT: lw a1, 4(a0) -; RV32-NEXT: lh a3, 8(a0) -; RV32-NEXT: lbu a0, 10(a0) -; RV32-NEXT: addi a2, a2, %lo(dst) -; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: sh a3, 8(a2) -; RV32-NEXT: sb a0, 10(a2) -; RV32-NEXT: li a0, 0 +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t0: +; RV64-LABEL: unaligned_memcpy2: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a0, %hi(src) -; RV64-NEXT: lui a1, %hi(dst) -; RV64-NEXT: ld a2, %lo(src)(a0) -; RV64-NEXT: addi a0, a0, %lo(src) -; RV64-NEXT: lh a3, 8(a0) -; RV64-NEXT: lbu a0, 10(a0) -; RV64-NEXT: sd a2, %lo(dst)(a1) -; RV64-NEXT: addi a1, a1, %lo(dst) -; RV64-NEXT: sh a3, 8(a1) -; RV64-NEXT: sb a0, 10(a1) -; RV64-NEXT: li a0, 0 +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t0: +; RV32-FAST-LABEL: unaligned_memcpy2: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(src) -; RV32-FAST-NEXT: lw a1, %lo(src)(a0) -; RV32-FAST-NEXT: addi a0, a0, %lo(src) -; RV32-FAST-NEXT: lw a2, 4(a0) -; RV32-FAST-NEXT: lw a0, 7(a0) -; RV32-FAST-NEXT: lui a3, %hi(dst) -; RV32-FAST-NEXT: sw a1, %lo(dst)(a3) -; RV32-FAST-NEXT: addi a1, a3, %lo(dst) -; RV32-FAST-NEXT: sw a0, 7(a1) -; RV32-FAST-NEXT: sw a2, 4(a1) -; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t0: +; RV64-FAST-LABEL: unaligned_memcpy2: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(src) -; RV64-FAST-NEXT: ld a1, %lo(src)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(src) -; RV64-FAST-NEXT: lw a0, 7(a0) -; RV64-FAST-NEXT: lui a2, %hi(dst) -; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) -; RV64-FAST-NEXT: addi a1, a2, %lo(dst) -; RV64-FAST-NEXT: sw a0, 7(a1) -; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) - ret i32 0 + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) + ret void } -define void @t1(ptr nocapture %C) nounwind { -; RV32-LABEL: t1: 
+define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str1) -; RV32-NEXT: addi a1, a1, %lo(.L.str1) -; RV32-NEXT: li a2, 31 +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy3: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lbu a2, 2(a1) +; RV32-FAST-NEXT: sb a2, 2(a0) +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy3: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lbu a2, 2(a1) +; RV64-FAST-NEXT: sb a2, 2(a0) +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) + ret void +} + +define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy4: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy4: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy4: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy4: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 false) + ret void +} + +define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy7: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; 
RV64-FAST-LABEL: unaligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) + ret void +} + +define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) + ret void +} + +define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy15: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 15 ; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t1: +; RV64-LABEL: unaligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str1) -; RV64-NEXT: addi a1, a1, %lo(.L.str1) -; RV64-NEXT: li a2, 31 +; RV64-NEXT: li a2, 15 ; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: t1: +; RV32-FAST-LABEL: unaligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1141 -; RV32-FAST-NEXT: lui a2, 300325 -; RV32-FAST-NEXT: lui a3, 132181 -; RV32-FAST-NEXT: lui a4, 340483 -; RV32-FAST-NEXT: lui a5, 267556 -; RV32-FAST-NEXT: lui a6, 337154 -; RV32-FAST-NEXT: addi a1, a1, -439 -; RV32-FAST-NEXT: sw a1, 27(a0) -; RV32-FAST-NEXT: lui a1, 320757 -; RV32-FAST-NEXT: addi a2, a2, 1107 -; RV32-FAST-NEXT: addi a3, a3, -689 -; RV32-FAST-NEXT: addi a4, a4, -947 -; RV32-FAST-NEXT: sw a4, 16(a0) -; RV32-FAST-NEXT: sw a3, 20(a0) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lui a2, 365861 -; RV32-FAST-NEXT: addi a3, a5, 1871 -; RV32-FAST-NEXT: addi a4, a6, 69 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: addi a2, a2, -1980 -; RV32-FAST-NEXT: sw a2, 0(a0) -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: sw a4, 8(a0) -; RV32-FAST-NEXT: sw a3, 12(a0) +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; 
RV64-FAST-LABEL: t1: +; RV64-FAST-LABEL: unaligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str1) -; RV64-FAST-NEXT: addi a2, a1, %lo(.L.str1) -; RV64-FAST-NEXT: ld a3, 23(a2) -; RV64-FAST-NEXT: ld a1, %lo(.L.str1)(a1) -; RV64-FAST-NEXT: ld a4, 8(a2) -; RV64-FAST-NEXT: ld a2, 16(a2) -; RV64-FAST-NEXT: sd a3, 23(a0) +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) ; RV64-FAST-NEXT: sd a1, 0(a0) -; RV64-FAST-NEXT: sd a4, 8(a0) -; RV64-FAST-NEXT: sd a2, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) ret void } -define void @t2(ptr nocapture %C) nounwind { -; RV32-BOTH-LABEL: t2: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) -; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) -; RV32-BOTH-NEXT: li a2, 36 -; RV32-BOTH-NEXT: tail memcpy +define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 16 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t2: +; RV64-LABEL: unaligned_memcpy16: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str2) -; RV64-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-NEXT: li a2, 36 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: tail memcpy ; -; RV64-FAST-LABEL: t2: +; RV32-FAST-LABEL: unaligned_memcpy16: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy16: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str2) -; RV64-FAST-NEXT: lui a2, 1156 -; RV64-FAST-NEXT: ld a3, %lo(.L.str2)(a1) -; RV64-FAST-NEXT: addi a2, a2, 332 -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-FAST-NEXT: sw a2, 32(a0) ; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: ld a4, 16(a1) -; RV64-FAST-NEXT: ld a1, 24(a1) -; RV64-FAST-NEXT: sd a3, 0(a0) ; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: sd a4, 16(a0) -; RV64-FAST-NEXT: sd a1, 24(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) ret void } -define void @t3(ptr nocapture %C) nounwind { -; RV32-LABEL: t3: +define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str3) -; RV32-NEXT: addi a1, a1, %lo(.L.str3) -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 31 ; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t3: +; RV64-LABEL: unaligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str3) -; RV64-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-NEXT: li a2, 24 +; RV64-NEXT: li a2, 31 ; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: t3: +; RV32-FAST-LABEL: unaligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1109 -; RV32-FAST-NEXT: lui a2, 340483 -; RV32-FAST-NEXT: lui a3, 267556 -; RV32-FAST-NEXT: lui a4, 337154 -; RV32-FAST-NEXT: lui a5, 320757 -; RV32-FAST-NEXT: addi a1, a1, -689 -; RV32-FAST-NEXT: addi a2, a2, -947 +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 
27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) ; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a3, 1871 -; RV32-FAST-NEXT: addi a3, a4, 69 -; RV32-FAST-NEXT: addi a4, a5, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) ; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t3: +; RV64-FAST-LABEL: unaligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str3) -; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-FAST-NEXT: ld a3, 8(a1) -; RV64-FAST-NEXT: ld a1, 16(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a3, 8(a0) -; RV64-FAST-NEXT: sd a1, 16(a0) +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) + ret void +} + +; ---------------------------------------------------------------------- +; Fully aligned cases + +define void @aligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 0, i1 false) + ret void +} + +define void @aligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 1, i1 false) + ret void +} + +define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy2: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + ret void +} + +define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy3: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a2, 2(a1) +; RV32-BOTH-NEXT: sb a2, 2(a0) +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy3: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a2, 2(a1) +; RV64-BOTH-NEXT: sb a2, 
2(a0)
+; RV64-BOTH-NEXT: lh a1, 0(a1)
+; RV64-BOTH-NEXT: sh a1, 0(a0)
+; RV64-BOTH-NEXT: ret
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false)
 ret void
 }
-define void @t4(ptr nocapture %C) nounwind {
-; RV32-LABEL: t4:
+define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind {
+; RV32-BOTH-LABEL: aligned_memcpy4:
+; RV32-BOTH: # %bb.0: # %entry
+; RV32-BOTH-NEXT: lw a1, 0(a1)
+; RV32-BOTH-NEXT: sw a1, 0(a0)
+; RV32-BOTH-NEXT: ret
+;
+; RV64-BOTH-LABEL: aligned_memcpy4:
+; RV64-BOTH: # %bb.0: # %entry
+; RV64-BOTH-NEXT: lw a1, 0(a1)
+; RV64-BOTH-NEXT: sw a1, 0(a0)
+; RV64-BOTH-NEXT: ret
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false)
+ ret void
+}
+
+define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind {
+; RV32-LABEL: aligned_memcpy7:
 ; RV32: # %bb.0: # %entry
-; RV32-NEXT: lui a1, %hi(.L.str4)
-; RV32-NEXT: addi a1, a1, %lo(.L.str4)
-; RV32-NEXT: li a2, 18
-; RV32-NEXT: tail memcpy
+; RV32-NEXT: lbu a2, 6(a1)
+; RV32-NEXT: sb a2, 6(a0)
+; RV32-NEXT: lh a2, 4(a1)
+; RV32-NEXT: sh a2, 4(a0)
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: ret
 ;
-; RV64-LABEL: t4:
+; RV64-LABEL: aligned_memcpy7:
 ; RV64: # %bb.0: # %entry
-; RV64-NEXT: lui a1, %hi(.L.str4)
-; RV64-NEXT: addi a1, a1, %lo(.L.str4)
-; RV64-NEXT: li a2, 18
-; RV64-NEXT: tail memcpy
+; RV64-NEXT: lbu a2, 6(a1)
+; RV64-NEXT: sb a2, 6(a0)
+; RV64-NEXT: lh a2, 4(a1)
+; RV64-NEXT: sh a2, 4(a0)
+; RV64-NEXT: lw a1, 0(a1)
+; RV64-NEXT: sw a1, 0(a0)
+; RV64-NEXT: ret
 ;
-; RV32-FAST-LABEL: t4:
+; RV32-FAST-LABEL: aligned_memcpy7:
 ; RV32-FAST: # %bb.0: # %entry
-; RV32-FAST-NEXT: li a1, 32
-; RV32-FAST-NEXT: lui a2, 132388
-; RV32-FAST-NEXT: lui a3, 337154
-; RV32-FAST-NEXT: lui a4, 320757
-; RV32-FAST-NEXT: sh a1, 16(a0)
-; RV32-FAST-NEXT: lui a1, 365861
-; RV32-FAST-NEXT: addi a2, a2, 1871
-; RV32-FAST-NEXT: addi a3, a3, 69
-; RV32-FAST-NEXT: addi a4, a4, 1107
-; RV32-FAST-NEXT: addi a1, a1, -1980
+; RV32-FAST-NEXT: lw a2, 3(a1)
+; RV32-FAST-NEXT: sw a2, 3(a0)
+; RV32-FAST-NEXT: lw a1, 0(a1)
 ; RV32-FAST-NEXT: sw a1, 0(a0)
-; RV32-FAST-NEXT: sw a4, 4(a0)
-; RV32-FAST-NEXT: sw a3, 8(a0)
-; RV32-FAST-NEXT: sw a2, 12(a0)
 ; RV32-FAST-NEXT: ret
 ;
-; RV64-FAST-LABEL: t4:
+; RV64-FAST-LABEL: aligned_memcpy7:
 ; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: lui a1, %hi(.L.str4)
-; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1)
-; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4)
-; RV64-FAST-NEXT: ld a1, 8(a1)
-; RV64-FAST-NEXT: li a3, 32
-; RV64-FAST-NEXT: sd a2, 0(a0)
-; RV64-FAST-NEXT: sd a1, 8(a0)
-; RV64-FAST-NEXT: sh a3, 16(a0)
+; RV64-FAST-NEXT: lw a2, 3(a1)
+; RV64-FAST-NEXT: sw a2, 3(a0)
+; RV64-FAST-NEXT: lw a1, 0(a1)
+; RV64-FAST-NEXT: sw a1, 0(a0)
 ; RV64-FAST-NEXT: ret
 entry:
- tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false)
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false)
+ ret void
+}
+
+define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind {
+; RV32-BOTH-LABEL: aligned_memcpy8:
+; RV32-BOTH: # %bb.0: # %entry
+; RV32-BOTH-NEXT: lw a2, 4(a1)
+; RV32-BOTH-NEXT: sw a2, 4(a0)
+; RV32-BOTH-NEXT: lw a1, 0(a1)
+; RV32-BOTH-NEXT: sw a1, 0(a0)
+; RV32-BOTH-NEXT: ret
+;
+; RV64-BOTH-LABEL: aligned_memcpy8:
+; RV64-BOTH: # %bb.0: # %entry
+; RV64-BOTH-NEXT: ld a1, 0(a1)
+; RV64-BOTH-NEXT: sd a1, 0(a0)
+; RV64-BOTH-NEXT: ret
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false)
 ret void
 }
-define void @t5(ptr nocapture %C) nounwind {
-; RV32-LABEL: t5:
+define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
+; RV32-LABEL: aligned_memcpy15:
 ; RV32: # %bb.0: # %entry
-; RV32-NEXT: li a1, 84
-; RV32-NEXT: li a2, 83
-; RV32-NEXT: li a3, 89
-; RV32-NEXT: li a4, 82
-; RV32-NEXT: li a5, 72
-; RV32-NEXT: li a6, 68
-; RV32-NEXT: sb a2, 4(a0)
-; RV32-NEXT: sb a1, 5(a0)
-; RV32-NEXT: sb zero, 6(a0)
-; RV32-NEXT: sb a6, 0(a0)
-; RV32-NEXT: sb a5, 1(a0)
-; RV32-NEXT: sb a4, 2(a0)
-; RV32-NEXT: sb a3, 3(a0)
+; RV32-NEXT: lbu a2, 14(a1)
+; RV32-NEXT: sb a2, 14(a0)
+; RV32-NEXT: lh a2, 12(a1)
+; RV32-NEXT: sh a2, 12(a0)
+; RV32-NEXT: lw a2, 8(a1)
+; RV32-NEXT: sw a2, 8(a0)
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: sw a2, 4(a0)
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: sw a1, 0(a0)
 ; RV32-NEXT: ret
 ;
-; RV64-LABEL: t5:
+; RV64-LABEL: aligned_memcpy15:
 ; RV64: # %bb.0: # %entry
-; RV64-NEXT: li a1, 84
-; RV64-NEXT: li a2, 83
-; RV64-NEXT: li a3, 89
-; RV64-NEXT: li a4, 82
-; RV64-NEXT: li a5, 72
-; RV64-NEXT: li a6, 68
-; RV64-NEXT: sb a2, 4(a0)
-; RV64-NEXT: sb a1, 5(a0)
-; RV64-NEXT: sb zero, 6(a0)
-; RV64-NEXT: sb a6, 0(a0)
-; RV64-NEXT: sb a5, 1(a0)
-; RV64-NEXT: sb a4, 2(a0)
-; RV64-NEXT: sb a3, 3(a0)
+; RV64-NEXT: lbu a2, 14(a1)
+; RV64-NEXT: sb a2, 14(a0)
+; RV64-NEXT: lh a2, 12(a1)
+; RV64-NEXT: sh a2, 12(a0)
+; RV64-NEXT: lw a2, 8(a1)
+; RV64-NEXT: sw a2, 8(a0)
+; RV64-NEXT: ld a1, 0(a1)
+; RV64-NEXT: sd a1, 0(a0)
 ; RV64-NEXT: ret
 ;
-; RV32-FAST-LABEL: t5:
+; RV32-FAST-LABEL: aligned_memcpy15:
 ; RV32-FAST: # %bb.0: # %entry
-; RV32-FAST-NEXT: lui a1, 1349
-; RV32-FAST-NEXT: addi a1, a1, 857
-; RV32-FAST-NEXT: sw a1, 3(a0)
-; RV32-FAST-NEXT: lui a1, 365861
-; RV32-FAST-NEXT: addi a1, a1, -1980
+; RV32-FAST-NEXT: lw a2, 11(a1)
+; RV32-FAST-NEXT: sw a2, 11(a0)
+; RV32-FAST-NEXT: lw a2, 8(a1)
+; RV32-FAST-NEXT: sw a2, 8(a0)
+; RV32-FAST-NEXT: lw a2, 4(a1)
+; RV32-FAST-NEXT: sw a2, 4(a0)
+; RV32-FAST-NEXT: lw a1, 0(a1)
 ; RV32-FAST-NEXT: sw a1, 0(a0)
 ; RV32-FAST-NEXT: ret
 ;
-; RV64-FAST-LABEL: t5:
+; RV64-FAST-LABEL: aligned_memcpy15:
 ; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: lui a1, 1349
-; RV64-FAST-NEXT: addi a1, a1, 857
-; RV64-FAST-NEXT: sw a1, 3(a0)
-; RV64-FAST-NEXT: lui a1, 365861
-; RV64-FAST-NEXT: addi a1, a1, -1980
-; RV64-FAST-NEXT: sw a1, 0(a0)
+; RV64-FAST-NEXT: ld a2, 7(a1)
+; RV64-FAST-NEXT: sd a2, 7(a0)
+; RV64-FAST-NEXT: ld a1, 0(a1)
+; RV64-FAST-NEXT: sd a1, 0(a0)
 ; RV64-FAST-NEXT: ret
 entry:
- tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false)
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false)
 ret void
 }
-define void @t6() nounwind {
-; RV32-LABEL: t6:
+define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind {
+; RV32-BOTH-LABEL: aligned_memcpy16:
+; RV32-BOTH: # %bb.0: # %entry
+; RV32-BOTH-NEXT: lw a2, 12(a1)
+; RV32-BOTH-NEXT: sw a2, 12(a0)
+; RV32-BOTH-NEXT: lw a2, 8(a1)
+; RV32-BOTH-NEXT: sw a2, 8(a0)
+; RV32-BOTH-NEXT: lw a2, 4(a1)
+; RV32-BOTH-NEXT: sw a2, 4(a0)
+; RV32-BOTH-NEXT: lw a1, 0(a1)
+; RV32-BOTH-NEXT: sw a1, 0(a0)
+; RV32-BOTH-NEXT: ret
+;
+; RV64-BOTH-LABEL: aligned_memcpy16:
+; RV64-BOTH: # %bb.0: # %entry
+; RV64-BOTH-NEXT: ld a2, 8(a1)
+; RV64-BOTH-NEXT: sd a2, 8(a0)
+; RV64-BOTH-NEXT: ld a1, 0(a1)
+; RV64-BOTH-NEXT: sd a1, 0(a0)
+; RV64-BOTH-NEXT: ret
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
+; RV32-LABEL: aligned_memcpy31:
 ; RV32: # %bb.0: # %entry
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: lui a0, %hi(spool.splbuf)
-; RV32-NEXT: addi a0, a0, %lo(spool.splbuf)
-; RV32-NEXT: lui a1, %hi(.L.str6)
-; RV32-NEXT: addi a1, a1, %lo(.L.str6)
-; RV32-NEXT: li a2, 14
-; RV32-NEXT: call memcpy
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32-NEXT: li a2, 31
+; RV32-NEXT: tail memcpy
 ;
-; RV64-LABEL: t6:
+; RV64-LABEL: aligned_memcpy31:
 ; RV64: # %bb.0: # %entry
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: lui a0, %hi(spool.splbuf)
-; RV64-NEXT: addi a0, a0, %lo(spool.splbuf)
-; RV64-NEXT: lui a1, %hi(.L.str6)
-; RV64-NEXT: addi a1, a1, %lo(.L.str6)
-; RV64-NEXT: li a2, 14
-; RV64-NEXT: call memcpy
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: lbu a2, 30(a1)
+; RV64-NEXT: sb a2, 30(a0)
+; RV64-NEXT: lh a2, 28(a1)
+; RV64-NEXT: sh a2, 28(a0)
+; RV64-NEXT: lw a2, 24(a1)
+; RV64-NEXT: sw a2, 24(a0)
+; RV64-NEXT: ld a2, 16(a1)
+; RV64-NEXT: sd a2, 16(a0)
+; RV64-NEXT: ld a2, 8(a1)
+; RV64-NEXT: sd a2, 8(a0)
+; RV64-NEXT: ld a1, 0(a1)
+; RV64-NEXT: sd a1, 0(a0)
 ; RV64-NEXT: ret
 ;
-; RV32-FAST-LABEL: t6:
+; RV32-FAST-LABEL: aligned_memcpy31:
 ; RV32-FAST: # %bb.0: # %entry
-; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf)
-; RV32-FAST-NEXT: li a1, 88
-; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0)
-; RV32-FAST-NEXT: lui a1, 361862
-; RV32-FAST-NEXT: addi a1, a1, -1960
-; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0)
-; RV32-FAST-NEXT: lui a1, 362199
-; RV32-FAST-NEXT: addi a1, a1, 559
-; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0)
-; RV32-FAST-NEXT: lui a1, 460503
-; RV32-FAST-NEXT: addi a1, a1, 1071
-; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0)
+; RV32-FAST-NEXT: lw a2, 27(a1)
+; RV32-FAST-NEXT: sw a2, 27(a0)
+; RV32-FAST-NEXT: lw a2, 24(a1)
+; RV32-FAST-NEXT: sw a2, 24(a0)
+; RV32-FAST-NEXT: lw a2, 20(a1)
+; RV32-FAST-NEXT: sw a2, 20(a0)
+; RV32-FAST-NEXT: lw a2, 16(a1)
+; RV32-FAST-NEXT: sw a2, 16(a0)
+; RV32-FAST-NEXT: lw a2, 12(a1)
+; RV32-FAST-NEXT: sw a2, 12(a0)
+; RV32-FAST-NEXT: lw a2, 8(a1)
+; RV32-FAST-NEXT: sw a2, 8(a0)
+; RV32-FAST-NEXT: lw a2, 4(a1)
+; RV32-FAST-NEXT: sw a2, 4(a0)
+; RV32-FAST-NEXT: lw a1, 0(a1)
+; RV32-FAST-NEXT: sw a1, 0(a0)
 ; RV32-FAST-NEXT: ret
 ;
-; RV64-FAST-LABEL: t6:
+; RV64-FAST-LABEL: aligned_memcpy31:
 ; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: lui a0, %hi(.L.str6)
-; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0)
-; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6)
-; RV64-FAST-NEXT: ld a0, 6(a0)
-; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf)
-; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2)
-; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2)
+; RV64-FAST-NEXT: ld a2, 23(a1)
+; RV64-FAST-NEXT: sd a2, 23(a0)
+; RV64-FAST-NEXT: ld a2, 16(a1)
+; RV64-FAST-NEXT: sd a2, 16(a0)
+; RV64-FAST-NEXT: ld a2, 8(a1)
+; RV64-FAST-NEXT: sd a2, 8(a0)
+; RV64-FAST-NEXT: ld a1, 0(a1)
+; RV64-FAST-NEXT: sd a1, 0(a0)
 ; RV64-FAST-NEXT: ret
 entry:
- call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false)
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false)
 ret void
 }
-%struct.Foo = type { i32, i32, i32, i32 }
+; ------------------------------------------------------------------------
+; A few partially aligned cases
+
-define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind {
-; RV32-BOTH-LABEL: t7:
+define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind {
+; RV32-BOTH-LABEL: memcpy16_align4:
 ; RV32-BOTH: # %bb.0: # %entry
 ; RV32-BOTH-NEXT: lw a2, 12(a1)
 ; RV32-BOTH-NEXT: sw a2, 12(a0)
@@ -418,7 +688,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind {
 ; RV32-BOTH-NEXT: sw a1, 0(a0)
 ; RV32-BOTH-NEXT: ret
 ;
-; RV64-LABEL: t7:
+; RV64-LABEL: memcpy16_align4:
 ; RV64: # %bb.0: # %entry
 ; RV64-NEXT: lw a2, 12(a1)
 ; RV64-NEXT: sw a2, 12(a0)
@@ -430,7 +700,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind {
 ; RV64-NEXT: sw a1, 0(a0)
 ; RV64-NEXT: ret
 ;
-; RV64-FAST-LABEL: t7:
+; RV64-FAST-LABEL: memcpy16_align4:
 ; RV64-FAST: # %bb.0: # %entry
 ; RV64-FAST-NEXT: ld a2, 8(a1)
 ; RV64-FAST-NEXT: sd a2, 8(a0)
@@ -438,11 +708,58 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind {
 ; RV64-FAST-NEXT: sd a1, 0(a0)
 ; RV64-FAST-NEXT: ret
 entry:
- tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false)
+ tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false)
 ret void
 }
+define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) {
+; RV32-LABEL: memcpy11_align8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: lbu a2, 10(a1)
+; RV32-NEXT: sb a2, 10(a0)
+; RV32-NEXT: lh a2, 8(a1)
+; RV32-NEXT: sh a2, 8(a0)
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: sw a2, 4(a0)
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: memcpy11_align8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lbu a2, 10(a1)
+; RV64-NEXT: sb a2, 10(a0)
+; RV64-NEXT: lh a2, 8(a1)
+; RV64-NEXT: sh a2, 8(a0)
+; RV64-NEXT: ld a1, 0(a1)
+; RV64-NEXT: sd a1, 0(a0)
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: ret
+;
+; RV32-FAST-LABEL: memcpy11_align8:
+; RV32-FAST: # %bb.0: # %entry
+; RV32-FAST-NEXT: lw a2, 7(a1)
+; RV32-FAST-NEXT: sw a2, 7(a0)
+; RV32-FAST-NEXT: lw a2, 4(a1)
+; RV32-FAST-NEXT: sw a2, 4(a0)
+; RV32-FAST-NEXT: lw a1, 0(a1)
+; RV32-FAST-NEXT: sw a1, 0(a0)
+; RV32-FAST-NEXT: li a0, 0
+; RV32-FAST-NEXT: ret
+;
+; RV64-FAST-LABEL: memcpy11_align8:
+; RV64-FAST: # %bb.0: # %entry
+; RV64-FAST-NEXT: lw a2, 7(a1)
+; RV64-FAST-NEXT: sw a2, 7(a0)
+; RV64-FAST-NEXT: ld a1, 0(a1)
+; RV64-FAST-NEXT: sd a1, 0(a0)
+; RV64-FAST-NEXT: li a0, 0
+; RV64-FAST-NEXT: ret
+entry:
+ call void @llvm.memcpy.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false)
+ ret i32 0
+}
+
 declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; RV64-BOTH: {{.*}}
diff --git a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll
new file mode 100644
index 0000000000000..9911b3119ce52
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll
@@ -0,0 +1,90 @@
+; RUN: opt -passes='spirv-structurizer' -S -mtriple=spirv-unknown-unknown %s | FileCheck %s
+
+; CHECK-LABEL: define spir_func noundef i32 @test_branch
+; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_branch, %if.end), i32 1)
+; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !{{[0-9]+}}
+define spir_func noundef i32 @test_branch(i32 noundef %X) {
+entry:
+ %X.addr = alloca i32, align 4
+ %resp = alloca i32, align 4
+ store i32 %X, ptr %X.addr, align 4
+ %0 = load i32, ptr %X.addr, align 4
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0
+
+if.then: ; preds = %entry
+ %1 = load i32, ptr %X.addr, align 4
+ %sub = sub nsw i32 0, %1
+ store i32 %sub, ptr %resp, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %2 = load i32, ptr %X.addr, align 4
+ %mul = mul nsw i32 %2, 2
+ store i32 %mul, ptr %resp, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %3 = load i32, ptr %resp, align 4
+ ret i32 %3
+}
+
+; CHECK-LABEL: define spir_func noundef i32 @test_flatten
+; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_flatten, %if.end), i32 2)
+; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !{{[0-9]+}}
+define spir_func noundef i32 @test_flatten(i32 noundef %X) {
+entry:
+ %X.addr = alloca i32, align 4
+ %resp = alloca i32, align 4
+ store i32 %X, ptr %X.addr, align 4
+ %0 = load i32, ptr %X.addr, align 4
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1
+
+if.then: ; preds = %entry
+ %1 = load i32, ptr %X.addr, align 4
+ %sub = sub nsw i32 0, %1
+ store i32 %sub, ptr %resp, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %2 = load i32, ptr %X.addr, align 4
+ %mul = mul nsw i32 %2, 2
+ store i32 %mul, ptr %resp, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %3 = load i32, ptr %resp, align 4
+ ret i32 %3
+}
+; CHECK-LABEL: define spir_func noundef i32 @test_no_attr
+; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_no_attr, %if.end), i32 0)
+; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else
+define spir_func noundef i32 @test_no_attr(i32 noundef %X) {
+entry:
+ %X.addr = alloca i32, align 4
+ %resp = alloca i32, align 4
+ store i32 %X, ptr %X.addr, align 4
+ %0 = load i32, ptr %X.addr, align 4
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %1 = load i32, ptr %X.addr, align 4
+ %sub = sub nsw i32 0, %1
+ store i32 %sub, ptr %resp, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %2 = load i32, ptr %X.addr, align 4
+ %mul = mul nsw i32 %2, 2
+ store i32 %mul, ptr %resp, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %3 = load i32, ptr %resp, align 4
+ ret i32 %3
+}
+
+!0 = !{!"hlsl.controlflow.hint", i32 1}
+!1 = !{!"hlsl.controlflow.hint", i32 2}
diff --git a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll
new file mode 100644
index 0000000000000..848eaf70f5a19
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll
@@ -0,0 +1,91 @@
+; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+
+define spir_func noundef i32 @test_branch(i32 noundef %X) {
+entry:
+; CHECK-LABEL: ; -- Begin function test_branch
+; CHECK: OpSelectionMerge %[[#]] DontFlatten
+ %X.addr = alloca i32, align 4
+ %resp = alloca i32, align 4
+ store i32 %X, ptr %X.addr, align 4
+ %0 = load i32, ptr %X.addr, align 4
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0
+
+if.then: ; preds = %entry
+ %1 = load i32, ptr %X.addr, align 4
+ %sub = sub nsw i32 0, %1
+ store i32 %sub, ptr %resp, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %2 = load i32, ptr %X.addr, align 4
+ %mul = mul nsw i32 %2, 2
+ store i32 %mul, ptr %resp, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %3 = load i32, ptr %resp, align 4
+ ret i32 %3
+}
+
+
+define spir_func noundef i32 @test_flatten(i32 noundef %X) {
+entry:
+; CHECK-LABEL: ; -- Begin function test_flatten
+; CHECK: OpSelectionMerge %[[#]] Flatten
+ %X.addr = alloca i32, align 4
+ %resp = alloca i32, align 4
+ store i32 %X, ptr %X.addr, align 4
+ %0 = load i32, ptr %X.addr, align 4
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1
+
+if.then: ; preds = %entry
+ %1 = load i32, ptr %X.addr, align 4
+ %sub = sub nsw i32 0, %1
+ store i32 %sub, ptr %resp, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %2 = load i32, ptr %X.addr, align 4
+ %mul = mul nsw i32 %2, 2
+ store i32 %mul, ptr %resp, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %3 = load i32, ptr %resp, align 4
+ ret i32 %3
+}
+
+define spir_func noundef i32 @test_no_attr(i32 noundef %X) {
+entry:
+; CHECK-LABEL: ; -- Begin function test_no_attr
+; CHECK: OpSelectionMerge %[[#]] None
+ %X.addr = alloca i32, align 4
+ %resp = alloca i32, align 4
+ store i32 %X, ptr %X.addr, align 4
+ %0 = load i32, ptr %X.addr, align 4
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %1 = load i32, ptr %X.addr, align 4
+ %sub = sub nsw i32 0, %1
+ store i32 %sub, ptr %resp, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %2 = load i32, ptr %X.addr, align 4
+ %mul = mul nsw i32 %2, 2
+ store i32 %mul, ptr %resp, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %3 = load i32, ptr %resp, align 4
+ ret i32 %3
+}
+
+!0 = !{!"hlsl.controlflow.hint", i32 1}
+!1 = !{!"hlsl.controlflow.hint", i32 2}
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 1305559bc04e0..3d72319f59ca9 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1337,10 +1337,9 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
-; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
+; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1789,10 +1788,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
 ; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
-; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1
-; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7]
+; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512F-FAST-NEXT: vzeroupper
 ; AVX512F-FAST-NEXT: retq
@@ -1808,10 +1806,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
 ; AVX512DQ-FAST: # %bb.0:
-; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
-; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1
-; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7]
+; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-FAST-NEXT: vzeroupper
 ; AVX512DQ-FAST-NEXT: retq
@@ -1827,10 +1824,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
-; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7]
+; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 5d901a8a380a9..aac5847061cbe 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -149,9 +149,10 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -160,11 +161,12 @@
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,13,3,5,13,3,9]
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -176,11 +178,11 @@
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -192,11 +194,12 @@
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [3,15,12,7,1,5,8,14]
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -208,11 +211,11 @@
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -256,9 +259,10 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -267,11 +271,12 @@
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2]
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -283,11 +288,11 @@
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, ptr %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -579,9 +584,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -590,11 +595,11 @@
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -606,11 +611,10 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -622,11 +626,11 @@
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -638,11 +642,10 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -686,9 +689,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -697,11 +700,11 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i1
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -713,11 +716,10 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -810,11 +812,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm3
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -827,11 +829,10 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -844,10 +845,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16
 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [19,1,5,31,9,12,17,9]
+; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -857,11 +857,11 @@
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm3
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -874,11 +874,10 @@
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, ptr %vp
@@ -1082,11 +1081,12 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,0,0,3]
-; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3]
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x i32>, ptr %vp
 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32>
@@ -1098,11 +1098,11 @@
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3]
 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x i32>, ptr %vp
 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32>
@@ -1567,9 +1567,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32
 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
@@ -1578,11 +1578,11 @@
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,4,1,13,15,4,6,12]
 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
@@ -1594,11 +1594,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
@@ -1610,10 +1609,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32
 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1623,11 +1621,11 @@
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm3
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1640,11 +1638,10 @@
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6]
 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1691,11 +1688,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm3
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -1708,11 +1705,10 @@
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,15,6,9]
 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i32>, ptr %vp
@@ -2474,9 +2470,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2]
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
@@ -2492,11 +2488,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,0,2]
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2]
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
@@ -2516,11 +2512,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
@@ -2572,11 +2567,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,7,1]
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1]
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
@@ -2596,11 +2591,10 @@
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
@@ -2620,9 +2614,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2]
-; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -2631,11 +2625,11 @@
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,2,3,2]
-; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -2647,11 +2641,10 @@
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2]
 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -3032,12 +3025,13 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,1,3,7]
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3049,12 +3043,12 @@
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3066,9 +3060,10 @@
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3]
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
+; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3077,12 +3072,13 @@
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,3]
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3094,12 +3090,12 @@
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3424,9 +3420,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %v
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3435,12 +3431,12 @@
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,7,11,5,10,0,4]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3452,12 +3448,11 @@
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3469,12 +3464,12 @@
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [11,0,9,0,7,14,0,8]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3486,12 +3481,11 @@
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3724,10 +3718,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3737,12 +3730,12 @@
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
-; CHECK-NEXT: vmovaps (%rdi), %ymm3
-; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3755,12 +3748,11 @@
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -4346,9 +4338,9 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double>
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,6,7,2]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4357,12 +4349,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,7,2]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,6,7,2]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4374,12 +4366,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,6,7,2]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4441,12 +4432,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
-; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
@@ -4467,12 +4458,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4]
-; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
@@ -4493,9 +4483,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp,
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4504,12 +4494,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,2,1,0]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x
double> undef, <4 x i32> @@ -4521,12 +4511,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/pr97968.ll b/llvm/test/CodeGen/X86/pr97968.ll index ca5c63cdc1c2e..a539a33e9a281 100644 --- a/llvm/test/CodeGen/X86/pr97968.ll +++ b/llvm/test/CodeGen/X86/pr97968.ll @@ -5,8 +5,8 @@ define <2 x i32> @PR97968(<16 x i32> %a0) { ; CHECK-LABEL: PR97968: ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,2,7] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %sub0 = shufflevector <16 x i32> %a0, <16 x i32> poison, <4 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 45842d4148a8b..82c460fc55938 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -65,10 +65,9 @@ define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: ; AVX512BWVL-FAST-ALL: # %bb.0: -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-FAST-ALL-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; AVX512BWVL-FAST-ALL-NEXT: vmovaps %ymm0, (%rsi) ; AVX512BWVL-FAST-ALL-NEXT: vzeroupper ; AVX512BWVL-FAST-ALL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index e7557134b1486..1d82d57e5552f 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -453,9 +453,8 @@ define <4 x double> @PR34175(ptr %p) { ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512BWVL-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512BWVL-NEXT: retq ; @@ -472,9 +471,8 @@ define <4 x double> @PR34175(ptr %p) { ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; 
AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512VBMIVL-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VBMIVL-NEXT: retq %v = load <32 x i16>, ptr %p, align 2 diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index f1fd05565c47e..df8a85fd07258 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -389,7 +389,7 @@ define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -452,7 +452,7 @@ define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -599,7 +599,7 @@ define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -694,7 +694,7 @@ define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec128_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -1003,7 +1003,7 @@ define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1079,7 +1079,7 @@ define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1355,7 +1355,7 @@ define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec256_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1550,7 +1550,7 @@ define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec256_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2170,7 +2170,7 @@ define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2258,7 +2258,7 @@ define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2722,7 +2722,7 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v3i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi) ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: movw %ax, (%rsi) @@ -3006,7 +3006,7 @@ define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v3i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -3664,7 +3664,7 @@ define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec384_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -3983,7 +3983,7 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v6i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -4420,7 +4420,7 @@ define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -5444,7 +5444,7 @@ define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5540,7 +5540,7 @@ define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5965,7 +5965,7 @@ define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec512_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -6363,7 +6363,7 @@ define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec512_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 0cefc1c32d71b..a39bc6b668669 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -345,66 +345,66 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i16_stride3_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride3_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; 
AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride3_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x i32> @@ -629,64 +629,60 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride3_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 +; 
AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride3_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride3_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; 
AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 68e92d7cf773f..739e6e2369e36 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -596,24 +596,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-NEXT: vmovq %xmm2, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -623,24 +621,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpextrw $7, %xmm2, %eax -; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = 
[3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -650,24 +646,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpextrw $7, %xmm2, %eax -; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -677,24 +671,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm2, %eax -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), 
%xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 751412c77a59a..c3b53211978ae 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -293,8 +293,8 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm5 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vpermw (%rdi), %ymm5, %ymm5 ; AVX512BW-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 ; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -307,6 +307,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8) ; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r9) ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rax) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride6_vf2: @@ -346,8 +347,8 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermw (%rdi), %ymm5, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -360,6 +361,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> @@ -580,21 +582,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermd (%rdi), %zmm1, %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] -; AVX512-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm0, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) +; AVX512-NEXT: vmovq %xmm2, (%rcx) ; AVX512-NEXT: vmovq %xmm1, (%r8) -; AVX512-NEXT: vmovq %xmm2, (%r9) -; AVX512-NEXT: vmovq %xmm4, (%rax) +; AVX512-NEXT: vmovq %xmm4, (%r9) +; AVX512-NEXT: vmovq %xmm5, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -612,21 +613,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] -; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm0, 
(%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rax) +; AVX512-FCP-NEXT: vmovq %xmm4, (%r9) +; AVX512-FCP-NEXT: vmovq %xmm5, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -645,21 +645,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermd (%rdi), %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] -; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-NEXT: vmovq %xmm2, (%r9) -; AVX512DQ-NEXT: vmovq %xmm4, (%rax) +; AVX512DQ-NEXT: vmovq %xmm4, (%r9) +; AVX512DQ-NEXT: vmovq %xmm5, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -677,21 +676,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) +; 
AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -699,25 +697,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%rax) +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -725,25 +722,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, 
%zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -751,25 +747,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -777,25 +772,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 @@ -2865,224 +2859,228 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i16_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = 
[34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride6_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride6_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: 
vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <96 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 713bd757a7b99..95b5ffde48564 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -321,22 +321,23 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm6 +; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm7 +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,13,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpermw (%rdi), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vmovd 
%xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovd %xmm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovd %xmm7, (%r9) -; AVX512BW-FCP-NEXT: vmovd %xmm3, (%r10) -; AVX512BW-FCP-NEXT: vmovd %xmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovd %xmm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r9) +; AVX512BW-FCP-NEXT: vmovd %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rax) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride7_vf2: @@ -378,22 +379,23 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,13,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermw (%rdi), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm7, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <14 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> @@ -906,28 +908,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; 
AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-NEXT: vmovq %xmm8, (%rax) +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm6, (%r10) +; AVX512BW-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -936,28 +937,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r10) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -966,28 +966,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r10) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -996,28 +995,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <28 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 051b4e300b827..fff21f9aad1bb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -623,31 +623,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovq {{.*#+}} 
xmm7 = [5,13,21,29,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r11) -; AVX512BW-NEXT: vmovq %xmm8, (%r10) -; AVX512BW-NEXT: vmovq %xmm9, (%rax) +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm6, (%r11) +; AVX512BW-NEXT: vmovq %xmm7, (%r10) +; AVX512BW-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -657,31 +656,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = 
[5,13,21,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r11) -; AVX512BW-FCP-NEXT: vmovq %xmm8, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm9, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -691,31 +689,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r11) -; AVX512DQ-BW-NEXT: vmovq %xmm8, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm9, (%rax) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; 
AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11) +; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -725,31 +722,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm9, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index 7cb46b79f7f36..f2c5a91d2cca3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -363,11 +363,10 @@ define void 
@load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP-LABEL: load_i32_stride2_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -385,11 +384,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512DQ-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -407,11 +405,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP-LABEL: load_i32_stride2_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -429,11 +426,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 213c5febfca23..d9383f524f1d1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -310,128 +310,120 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i32_stride3_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, 
%ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-NEXT: vmovaps %xmm1, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride3_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovaps %xmm1, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride3_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovaps %xmm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-FCP-NEXT: 
vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride3_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: vmovaps %xmm2, (%rdx) +; AVX512BW-NEXT: vmovaps %xmm1, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride3_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rcx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride3_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%rcx) ; AVX512DQ-BW-NEXT: vzeroupper ; 
AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 61f91b2bb0c0c..0bf1260738439 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -106,13 +106,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) +; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride4_vf2: @@ -134,13 +135,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride4_vf2: @@ -162,13 +164,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq 
{{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf2: @@ -190,13 +193,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <8 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32> @@ -361,152 +365,144 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i32_stride4_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512-NEXT: vmovdqa %xmm5, (%r8) +; AVX512-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-NEXT: vmovaps %xmm3, (%rcx) +; AVX512-NEXT: vmovaps %xmm1, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride4_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = 
[3,7,11,15] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8) +; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovaps %xmm3, (%rcx) +; AVX512-FCP-NEXT: vmovaps %xmm1, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride4_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8) +; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovaps %xmm3, (%rcx) +; AVX512DQ-NEXT: vmovaps %xmm1, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%r8) ; 
AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride4_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512BW-NEXT: vmovdqa %xmm5, (%r8) +; AVX512BW-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512BW-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: vmovaps %xmm2, (%rdx) +; AVX512BW-NEXT: vmovaps %xmm3, (%rcx) +; AVX512BW-NEXT: vmovaps %xmm1, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride4_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa 
%xmm5, (%r8) +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovaps %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index d8d48b0b8c73d..c08442f9d9d01 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -144,19 +144,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i32_stride5_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm0[0],mem[1],xmm0[2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512-FCP-NEXT: vmovlps %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -188,19 +188,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i32_stride5_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -232,19 +232,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i32_stride5_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 
= xmm0[0],mem[1],xmm0[2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper @@ -276,19 +276,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i32_stride5_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -491,18 +491,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512-NEXT: vmovdqa 
%xmm2, (%rdx) +; AVX512-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512-NEXT: vmovdqa %xmm5, (%r8) ; AVX512-NEXT: vmovdqa %xmm6, (%r9) @@ -514,18 +513,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9) @@ -537,18 +535,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8) ; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9) @@ -560,18 +557,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 
(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9) @@ -583,18 +579,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8) ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9) @@ -606,18 +601,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9) @@ -629,18 +623,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = 
[4,9,14,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9) @@ -652,18 +645,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 3ba41ad07ce83..ae3e5445bf266 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -192,29 +192,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i32_stride6_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] -; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm3, (%rax) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; 
AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512-FCP-NEXT: vmovlps %xmm4, (%r9) +; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -252,29 +251,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i32_stride6_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] -; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512DQ-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -312,29 +310,28 @@ define void 
@load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i32_stride6_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] -; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512BW-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rax) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r9) +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -372,29 +369,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i32>, ptr %in.vec, align 64 @@ -1291,352 +1287,360 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i32_stride6_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = 
[0,1,2,3,4,5,12,10] +; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512-NEXT: vmovdqa %ymm5, (%r8) -; AVX512-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-NEXT: vmovdqa %ymm2, (%rax) +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512-NEXT: vmovdqa %ymm3, (%r8) +; AVX512-NEXT: vmovdqa %ymm6, (%r9) +; AVX512-NEXT: vmovdqa %ymm4, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride6_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride6_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-NEXT: vmovdqa %ymm6, (%r9) +; AVX512DQ-NEXT: vmovdqa %ymm4, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: 
vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
+; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
+; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: load_i32_stride6_vf8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3
 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0
 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10]
-; AVX512BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
-; AVX512BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12]
-; AVX512BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm4
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13]
-; AVX512BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5
+; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6
+; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
+; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
+; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
+; AVX512BW-NEXT: vpermd %zmm6, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
+; AVX512BW-NEXT: vpermd %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14]
-; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm8
+; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
+; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15]
-; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT: vmovdqa %ymm6, (%rsi)
-; AVX512BW-NEXT: vmovdqa %ymm7, (%rdx)
-; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx)
-; AVX512BW-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512BW-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512BW-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
+; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
+; AVX512BW-NEXT: vmovdqa %ymm7, (%rsi)
+; AVX512BW-NEXT: vmovdqa %ymm8, (%rdx)
+; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx)
+; AVX512BW-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512BW-NEXT: vmovdqa %ymm6, (%r9)
+; AVX512BW-NEXT: vmovdqa %ymm4, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i32_stride6_vf8:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
+; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
+; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
+; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
+; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
+; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8
+; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
+; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
+; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
+; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride6_vf8:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3
 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm0
 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5
+; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6
+; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
+; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
+; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
+; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
+; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm8
+; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
+; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
-; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
+; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
+; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r9)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rax)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf8:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <48 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index d806253ef23a0..694f2bc53c515 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -204,22 +204,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1
+; AVX512-NEXT: vmovaps (%rdi), %ymm5
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm6
+; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
+; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5
 ; AVX512-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovq %xmm1, (%r9)
-; AVX512-NEXT: vmovq %xmm7, (%r10)
-; AVX512-NEXT: vmovq %xmm5, (%rax)
+; AVX512-NEXT: vmovlps %xmm1, (%r9)
+; AVX512-NEXT: vmovlps %xmm7, (%r10)
+; AVX512-NEXT: vmovlps %xmm5, (%rax)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -227,30 +227,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
-; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0]
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
-; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
-; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
+; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
+; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
+; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
+; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9)
 ; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512-FCP-NEXT: vmovq %xmm8, (%rax)
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -269,22 +270,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512DQ-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1
+; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5
 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm1, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-NEXT: vmovq %xmm5, (%rax)
+; AVX512DQ-NEXT: vmovlps %xmm1, (%r9)
+; AVX512DQ-NEXT: vmovlps %xmm7, (%r10)
+; AVX512DQ-NEXT: vmovlps %xmm5, (%rax)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -292,30 +293,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
+; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
+; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rax)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -334,22 +336,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512BW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovaps (%rdi), %ymm5
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5
 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovq %xmm1, (%r9)
-; AVX512BW-NEXT: vmovq %xmm7, (%r10)
-; AVX512BW-NEXT: vmovq %xmm5, (%rax)
+; AVX512BW-NEXT: vmovlps %xmm1, (%r9)
+; AVX512BW-NEXT: vmovlps %xmm7, (%r10)
+; AVX512BW-NEXT: vmovlps %xmm5, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -357,30 +359,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
+; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
+; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
+; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
+; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9)
 ; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax)
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -399,22 +402,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512DQ-BW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5
 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
-; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rax)
+; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9)
+; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10)
+; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -422,30 +425,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <14 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index f0c95f4fa9ef8..8d7f8d1db8522 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -222,24 +222,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
 ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5]
-; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
+; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r11)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%r10)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9)
+; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11)
+; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10)
+; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -287,24 +288,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
+; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r10)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -352,24 +354,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
 ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
-; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
+; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r10)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9)
+; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11)
+; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10)
+; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -417,24 +420,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
+; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <16 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
index 2381df6d73289..aa7d8ceb14950 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
@@ -245,13 +245,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512-FCP-LABEL: load_i64_stride2_vf4:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7]
-; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512-FCP-NEXT: vmovaps %ymm1, (%rdx)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -271,13 +270,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512DQ-FCP-LABEL: load_i64_stride2_vf4:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%rdx)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -297,13 +295,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512BW-FCP-LABEL: load_i64_stride2_vf4:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1
+; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rdx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -323,13 +320,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride2_vf4:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %wide.vec = load <8 x i64>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index f82bcd1ce3e1e..7d3209397c3df 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -611,32 +611,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6
-; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6]
-; AVX512-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10]
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5
+; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
+; AVX512-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10]
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm4, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm4, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9)
 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
@@ -694,32 +693,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
+; AVX512DQ-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9)
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
@@ -777,32 +775,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4]
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6
-; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6]
-; AVX512BW-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5
+; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
+; AVX512BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
 ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9)
 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
@@ -860,32 +857,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
+; AVX512DQ-BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 4e5501b1041d3..cc3e5f3d1d82e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -709,28 +709,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
 ; AVX512-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
+; AVX512-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
 ; AVX512-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
@@ -739,9 +739,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r10)
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r10)
 ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
@@ -814,28 +814,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
+; AVX512DQ-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
 ; AVX512DQ-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
@@ -844,9 +844,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r10)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r10)
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
@@ -919,28 +919,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7]
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3]
+; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
+; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512BW-FCP-NEXT: vperm2i128
{{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -949,9 +949,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1024,28 +1024,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; 
AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -1054,9 +1054,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 181f5651784d8..acedcf4263906 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1337,10 +1337,9 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. 
; ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] -; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1789,10 +1788,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1808,10 +1806,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -1827,10 +1824,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq diff --git a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll index ccc7e93615fed..7cf78a5d54e71 100644 --- a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll +++ b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll @@ -9,6 +9,8 @@ ; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=MINRT-NOMERGE ; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=MINRTABORT-NOMERGE ; +; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=TR-GUARD +; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=RT-GUARD target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" define void @f1(i64 %x) nounwind { @@ -123,6 +125,42 @@ define void 
@f1(i64 %x) nounwind { ; MINRTABORT-NOMERGE: [[TRAP]]: ; MINRTABORT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_minimal_abort() #[[ATTR2:[0-9]+]], !nosanitize [[META0]] ; MINRTABORT-NOMERGE-NEXT: unreachable, !nosanitize [[META0]] +; +; TR-GUARD-LABEL: define void @f1( +; TR-GUARD-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; TR-GUARD-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] +; TR-GUARD-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 +; TR-GUARD-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; TR-GUARD-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP7:%.*]] = call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP8:%.*]] = and i1 [[TMP6]], [[TMP7]], !nosanitize [[META0]] +; TR-GUARD-NEXT: br i1 [[TMP8]], label %[[TRAP:.*]], label %[[BB9:.*]] +; TR-GUARD: [[BB9]]: +; TR-GUARD-NEXT: [[TMP10:%.*]] = load i128, ptr [[TMP2]], align 4 +; TR-GUARD-NEXT: ret void +; TR-GUARD: [[TRAP]]: +; TR-GUARD-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR3:[0-9]+]], !nosanitize [[META0]] +; TR-GUARD-NEXT: unreachable, !nosanitize [[META0]] +; +; RT-GUARD-LABEL: define void @f1( +; RT-GUARD-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; RT-GUARD-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] +; RT-GUARD-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 +; RT-GUARD-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; RT-GUARD-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP7:%.*]] = call i1 @llvm.allow.ubsan.check(i8 -5), !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP8:%.*]] = and i1 [[TMP6]], [[TMP7]], !nosanitize [[META0]] +; RT-GUARD-NEXT: br i1 [[TMP8]], label %[[TRAP:.*]], label %[[BB9:.*]] +; RT-GUARD: [[BB9]]: +; RT-GUARD-NEXT: [[TMP10:%.*]] = load i128, ptr [[TMP2]], align 4 +; RT-GUARD-NEXT: ret void +; RT-GUARD: [[TRAP]]: +; RT-GUARD-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR2:[0-9]+]], !nosanitize [[META0]] +; RT-GUARD-NEXT: br label %[[BB9]], !nosanitize [[META0]] ; %1 = alloca i128, i64 %x %3 = load i128, ptr %1, align 4 @@ -154,6 +192,15 @@ define void @f1(i64 %x) nounwind { ; MINRTABORT-NOMERGE: attributes #[[ATTR1:[0-9]+]] = { noreturn nounwind } ; MINRTABORT-NOMERGE: attributes #[[ATTR2]] = { nomerge noreturn nounwind } ;. +; TR-GUARD: attributes #[[ATTR0]] = { nounwind } +; TR-GUARD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; TR-GUARD: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind } +; TR-GUARD: attributes #[[ATTR3]] = { nomerge noreturn nounwind } +;. +; RT-GUARD: attributes #[[ATTR0]] = { nounwind } +; RT-GUARD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; RT-GUARD: attributes #[[ATTR2]] = { nomerge nounwind } +;. ; TR: [[META0]] = !{} ;. ; RT: [[META0]] = !{} @@ -168,3 +215,7 @@ define void @f1(i64 %x) nounwind { ;. ; MINRTABORT-NOMERGE: [[META0]] = !{} ;. +; TR-GUARD: [[META0]] = !{} +;. +; RT-GUARD: [[META0]] = !{} +;. 
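For readers skimming the BoundsChecking change above: the new TR-GUARD and RT-GUARD expectations both gate the bounds check on llvm.allow.ubsan.check and differ only in what the trap block does. Below is a hand-reduced sketch of the pattern those CHECK lines assert, not the pass output itself; it assumes trap mode with guard kind 3 (the tests use 3 and -5 as arbitrary guard kinds), and the function name and labels are illustrative:

declare i1 @llvm.allow.ubsan.check(i8)
declare void @llvm.ubsantrap(i8)

define void @sketch(i64 %x) {
  %size = mul i64 16, %x                          ; bytes allocated below
  %p = alloca i128, i64 %x, align 8
  %oob = icmp ult i64 %size, 16                   ; would a 16-byte load overflow?
  %allow = call i1 @llvm.allow.ubsan.check(i8 3)  ; i8 arg = configured guard kind
  %check = and i1 %oob, %allow                    ; only check when the guard allows it
  br i1 %check, label %trap, label %cont
cont:
  %v = load i128, ptr %p, align 4
  ret void
trap:
  call void @llvm.ubsantrap(i8 3)                 ; trap mode: abort here
  unreachable
}

In the RT-GUARD flavour the trap block instead calls the recoverable handler __ubsan_handle_local_out_of_bounds() and ends in br label %cont rather than unreachable, which is why its handler attribute set is nomerge nounwind without noreturn.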
diff --git a/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s b/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s index 08a589e1f963f..9e40830882c87 100644 --- a/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s +++ b/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm 2>&1 < %s| FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm 2>&1 < %s| FileCheck %s // ------------------------------------------------------------------------- // diff --git a/llvm/test/MC/AArch64/SVE2/bdep.s b/llvm/test/MC/AArch64/SVE2/bdep.s index a6ef95d9f2619..44c848d0b3b59 100644 --- a/llvm/test/MC/AArch64/SVE2/bdep.s +++ b/llvm/test/MC/AArch64/SVE2/bdep.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bdep z0.b, z1.b, z31.b // CHECK-INST: bdep z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb4,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb420 bdep z0.h, z1.h, z31.h // CHECK-INST: bdep z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xb4,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb420 bdep z0.s, z1.s, z31.s // CHECK-INST: bdep z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb4,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb420 bdep z0.d, z1.d, z31.d // CHECK-INST: bdep z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xb4,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb420 diff --git a/llvm/test/MC/AArch64/SVE2/bext.s b/llvm/test/MC/AArch64/SVE2/bext.s index 43272205ab897..ea519c22cceb5 100644 --- a/llvm/test/MC/AArch64/SVE2/bext.s +++ b/llvm/test/MC/AArch64/SVE2/bext.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // 
RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bext z0.b, z1.b, z31.b // CHECK-INST: bext z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb0,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb020 bext z0.h, z1.h, z31.h // CHECK-INST: bext z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xb0,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb020 bext z0.s, z1.s, z31.s // CHECK-INST: bext z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb0,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb020 bext z0.d, z1.d, z31.d // CHECK-INST: bext z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xb0,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb020 diff --git a/llvm/test/MC/AArch64/SVE2/bgrp.s b/llvm/test/MC/AArch64/SVE2/bgrp.s index fb96946dc3c53..eb58d13511583 100644 --- a/llvm/test/MC/AArch64/SVE2/bgrp.s +++ b/llvm/test/MC/AArch64/SVE2/bgrp.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bgrp z0.b, z1.b, z31.b // CHECK-INST: bgrp z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb8,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb820 bgrp z0.h, z1.h, z31.h // CHECK-INST: bgrp z0.h, z1.h, z31.h 
// CHECK-ENCODING: [0x20,0xb8,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb820 bgrp z0.s, z1.s, z31.s // CHECK-INST: bgrp z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb8,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb820 bgrp z0.d, z1.d, z31.d // CHECK-INST: bgrp z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xb8,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb820 diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s index 090d8af85825a..2cfce3b232ffc 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s @@ -29,10 +29,16 @@ rax1 z0.d, z0.d, z0.d // CHECK: error: instruction requires: sve2-sha3 // CHECK-NEXT: rax1 z0.d, z0.d, z0.d -.arch armv9-a+sve2-bitperm -.arch armv9-a+nosve2-bitperm +.arch armv9-a+ssve-bitperm +.arch armv9-a+nossve-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: instruction requires: sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.arch armv9-a+sve2+sve-bitperm +.arch armv9-a+sve2+nosve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .arch armv9-a+f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch.s b/llvm/test/MC/AArch64/SVE2/directive-arch.s index 1319a8a186971..203541a09ad37 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch.s @@ -20,7 +20,11 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.arch armv9-a+sve2-bitperm +.arch armv9-a+sve2+sve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: bgrp z21.s, z10.s, z21.s + +.arch armv9-a+ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s index 2eb22ebf7428c..2fab61597576f 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s @@ -34,7 +34,13 @@ rax1 z0.d, z0.d, z0.d .arch_extension sve2-bitperm .arch_extension nosve2-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: instruction requires: sve2 or ssve-bitperm sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.arch_extension sve2-bitperm +.arch_extension nosve2 +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve2 or ssve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .arch_extension f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s index ce56127ca93b1..e45e1f9881422 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s @@ -20,7 +20,7 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.arch_extension sve2-bitperm +.arch_extension ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s 
b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s index 461b9298df621..a50b990949424 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s @@ -29,10 +29,16 @@ rax1 z0.d, z0.d, z0.d // CHECK: error: instruction requires: sve2-sha3 // CHECK-NEXT: rax1 z0.d, z0.d, z0.d -.cpu generic+sve2-bitperm -.cpu generic+nosve2-bitperm +.cpu generic+sve2+sve-bitperm +.cpu generic+sve2+nosve-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: instruction requires: sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.cpu generic+ssve-bitperm +.cpu generic+nossve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .cpu generic+sve2+f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu.s b/llvm/test/MC/AArch64/SVE2/directive-cpu.s index c54a3a9f272c3..0d873dd9b53f1 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu.s @@ -20,7 +20,11 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.cpu generic+sve2-bitperm +.cpu generic+sve2+sve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: bgrp z21.s, z10.s, z21.s + +.cpu generic+ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/test/MC/AArch64/win-import-call-optimization.s b/llvm/test/MC/AArch64/win-import-call-optimization.s deleted file mode 100644 index f26e17b9b62cc..0000000000000 --- a/llvm/test/MC/AArch64/win-import-call-optimization.s +++ /dev/null @@ -1,72 +0,0 @@ -// RUN: llvm-mc -triple aarch64-windows-msvc -filetype obj -o %t.obj %s -// RUN: llvm-readobj --sections --sd --relocs %t.obj | FileCheck %s - -.section nc_sect,"xr" -normal_call: - str x30, [sp, #-16]! 
// 8-byte Folded Spill - adrp x8, __imp_a - ldr x8, [x8, :lo12:__imp_a] -.Limpcall0: - blr x8 - ldr x30, [sp], #16 // 8-byte Folded Reload - ret - -.section tc_sect,"xr" -tail_call: - adrp x8, __imp_b - ldr x8, [x8, :lo12:__imp_b] -.Limpcall1: - br x8 - -.section .impcall,"yi" -.asciz "Imp_Call_V1" -.word 20 -.secnum nc_sect -.word 19 -.secoffset .Limpcall0 -.symidx __imp_a -.word 20 -.secnum tc_sect -.word 19 -.secoffset .Limpcall1 -.symidx __imp_b - -// CHECK-LABEL: Name: .impcall (2E 69 6D 70 63 61 6C 6C) -// CHECK-NEXT: VirtualSize: 0x0 -// CHECK-NEXT: VirtualAddress: 0x0 -// CHECK-NEXT: RawDataSize: 52 -// CHECK-NEXT: PointerToRawData: 0x150 -// CHECK-NEXT: PointerToRelocations: 0x0 -// CHECK-NEXT: PointerToLineNumbers: 0x0 -// CHECK-NEXT: RelocationCount: 0 -// CHECK-NEXT: LineNumberCount: 0 -// CHECK-NEXT: Characteristics [ -// CHECK-NEXT: IMAGE_SCN_ALIGN_4BYTES -// CHECK-NEXT: IMAGE_SCN_LNK_INFO -// CHECK-NEXT: ] -// CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 496D705F 43616C6C 5F563100 14000000 |Imp_Call_V1.....| -// CHECK-NEXT: 0010: -// CHECK-SAME: [[#%.2X,NCSECT:]]000000 -// CHECK-SAME: 13000000 -// CHECK-SAME: [[#%.2X,NCOFFSET:]]000000 -// CHECK-SAME: [[#%.2X,NCSYM:]]000000 -// CHECK-NEXT: 0020: -// CHECK-SAME: 14000000 -// CHECK-SAME: [[#%.2X,TCSECT:]]000000 -// CHECK-SAME: 13000000 -// CHECK-SAME: [[#%.2X,TCOFFSET:]]000000 -// CHECK-NEXT: 0030: -// CHECK-SAME: [[#%.2X,TCSYM:]]000000 -// CHECK-NEXT: ) - -// CHECK-LABEL: Relocations [ -// CHECK-NEXT: Section ([[#%u,NCSECT]]) nc_sect { -// CHECK-NEXT: 0x[[#%x,NCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_a ([[#%u,NCSYM]]) -// CHECK-NEXT: 0x[[#%x,NCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_a ([[#%u,NCSYM]]) -// CHECK-NEXT: } -// CHECK-NEXT: Section ([[#%u,TCSECT]]) tc_sect { -// CHECK-NEXT: 0x[[#%x,TCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_b ([[#%u,TCSYM]]) -// CHECK-NEXT: 0x[[#%x,TCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_b ([[#%u,TCSYM]]) -// CHECK-NEXT: } -// CHECK-NEXT: ] diff --git a/llvm/test/MC/COFF/bad-parse.s b/llvm/test/MC/COFF/bad-parse.s deleted file mode 100644 index 2491f41abeb4e..0000000000000 --- a/llvm/test/MC/COFF/bad-parse.s +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: not llvm-mc -filetype=obj -triple i386-pc-win32 %s 2>&1 | FileCheck %s - - .data - -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive - .secnum -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive - .secnum section extra - -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive - .secoffset -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive - .secoffset section extra diff --git a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt index e9a9f1327a17e..53ae3b8b73ab4 100644 --- a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt +++ b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt @@ -90,7 +90,7 @@ ## MRM5m # ATT: vscatterpf0dps (%r16,%zmm0) {%k1} -# INTEL: vscatterpf0dps {k1}, zmmword ptr [r16 + zmm0] +# INTEL: vscatterpf0dps {k1}, dword ptr [r16 + zmm0] 0x62,0xfa,0x7d,0x49,0xc6,0x2c,0x00 # ATT: subq $127, 123(%r16), %r17 diff --git a/llvm/test/MC/Disassembler/X86/intel-syntax.txt b/llvm/test/MC/Disassembler/X86/intel-syntax.txt index c7c0fce268cd2..f9284ab388441 100644 --- a/llvm/test/MC/Disassembler/X86/intel-syntax.txt +++ b/llvm/test/MC/Disassembler/X86/intel-syntax.txt @@ -108,10 +108,10 @@ # CHECK: vshufpd xmm0, xmm1, xmm2, 1 0xc5 0xf1 0xc6 0xc2 0x01 -# CHECK: vpgatherqq 
ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +# CHECK: vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 0xc4 0xe2 0xfd 0x91 0x14 0x4f -# CHECK: vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +# CHECK: vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 0xc4 0x02 0x39 0x90 0x14 0x4f # CHECK: xsave64 [rax] diff --git a/llvm/test/MC/RISCV/xqciint-invalid.s b/llvm/test/MC/RISCV/xqciint-invalid.s new file mode 100644 index 0000000000000..e748109f41d82 --- /dev/null +++ b/llvm/test/MC/RISCV/xqciint-invalid.s @@ -0,0 +1,105 @@ +# Xqciint - Qualcomm uC Interrupts extension +# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqciint < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-IMM %s +# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqciint < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-EXT %s + +# CHECK-IMM: :[[@LINE+1]]:12: error: immediate must be an integer in the range [0, 1023] +qc.setinti 1025 + +# CHECK: :[[@LINE+1]]:16: error: invalid operand for instruction +qc.setinti 11, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.setinti + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.setinti 10 + + +# CHECK-IMM: :[[@LINE+1]]:12: error: immediate must be an integer in the range [0, 1023] +qc.clrinti 2000 + +# CHECK: :[[@LINE+1]]:16: error: invalid operand for instruction +qc.clrinti 22, x4 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.clrinti + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.clrinti 8 + + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.c.clrint 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.clrint + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.clrint x8 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.c.di 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.di + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.c.dir 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.dir + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.dir x8 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.c.ei 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.ei + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.c.eir 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.eir + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.eir x8 + + +# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction +qc.c.mienter.nest 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mienter.nest + + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.c.mienter 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mienter + + +# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction +qc.c.mileaveret 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: 
instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mileaveret + + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.c.setint 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.setint + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.setint x8 diff --git a/llvm/test/MC/RISCV/xqciint-valid.s b/llvm/test/MC/RISCV/xqciint-valid.s new file mode 100644 index 0000000000000..c05a402b5b14a --- /dev/null +++ b/llvm/test/MC/RISCV/xqciint-valid.s @@ -0,0 +1,81 @@ +# Xqciint - Qualcomm uC Interrupts extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciint -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciint < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqciint -M no-aliases --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciint -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciint < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqciint --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# CHECK-INST: qc.setinti 500 +# CHECK-ENC: encoding: [0x73,0x00,0xfa,0xcc] +qc.setinti 500 + +# CHECK-INST: qc.setinti 0 +# CHECK-ENC: encoding: [0x73,0x00,0x00,0xcc] +qc.setinti 0 + +# CHECK-INST: qc.setinti 1023 +# CHECK-ENC: encoding: [0x73,0x80,0xff,0xcd] +qc.setinti 1023 + + +# CHECK-INST: qc.clrinti 500 +# CHECK-ENC: encoding: [0x73,0x00,0xfa,0xce] +qc.clrinti 500 + +# CHECK-INST: qc.clrinti 1023 +# CHECK-ENC: encoding: [0x73,0x80,0xff,0xcf] +qc.clrinti 1023 + +# CHECK-INST: qc.clrinti 0 +# CHECK-ENC: encoding: [0x73,0x00,0x00,0xce] +qc.clrinti 0 + + +# CHECK-INST: qc.c.clrint a0 +# CHECK-ENC: encoding: [0x0e,0x15] +qc.c.clrint x10 + + +# CHECK-INST: qc.c.di +# CHECK-ENC: encoding: [0x12,0x1b] +qc.c.di + + +# CHECK-INST: qc.c.dir a0 +# CHECK-ENC: encoding: [0x02,0x15] +qc.c.dir x10 + + +# CHECK-INST: qc.c.ei +# CHECK-ENC: encoding: [0x92,0x1b] +qc.c.ei + + +# CHECK-INST: qc.c.eir a0 +# CHECK-ENC: encoding: [0x06,0x15] +qc.c.eir x10 + + +# CHECK-INST: qc.c.mienter.nest +# CHECK-ENC: encoding: [0x92,0x18] +qc.c.mienter.nest + + +# CHECK-INST: qc.c.mienter +# CHECK-ENC: encoding: [0x12,0x18] +qc.c.mienter + + +# CHECK-INST: qc.c.mileaveret +# CHECK-ENC: encoding: [0x12,0x1a] +qc.c.mileaveret + + +# CHECK-INST: qc.c.setint a0 +# CHECK-ENC: encoding: [0x0a,0x15] +qc.c.setint x10 diff --git a/llvm/test/MC/X86/avx-64-intel.s b/llvm/test/MC/X86/avx-64-intel.s index c1f20d204a8c4..392f6e9928427 100644 --- a/llvm/test/MC/X86/avx-64-intel.s +++ b/llvm/test/MC/X86/avx-64-intel.s @@ -1,68 +1,68 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -// CHECK: vgatherdpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vgatherdpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x92,0x14,0x4f] - vgatherdpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vgatherdpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vgatherqpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vgatherqpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x93,0x14,0x4f] - vgatherqpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vgatherqpd xmm2, 
qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vgatherdpd ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 +// CHECK: vgatherdpd ymm2, qword ptr [rdi + 2*xmm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x92,0x14,0x4f] - vgatherdpd ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 + vgatherdpd ymm2, qword ptr [rdi + 2*xmm1], ymm0 -// CHECK: vgatherqpd ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +// CHECK: vgatherqpd ymm2, qword ptr [rdi + 2*ymm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x93,0x14,0x4f] - vgatherqpd ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 + vgatherqpd ymm2, qword ptr [rdi + 2*ymm1], ymm0 -// CHECK: vgatherdps xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vgatherdps xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x92,0x14,0x4f] - vgatherdps xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 + vgatherdps xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vgatherqps xmm10, qword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vgatherqps xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x93,0x14,0x4f] - vgatherqps xmm10, qword ptr [r15 + 2*xmm9], xmm8 + vgatherqps xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vgatherdps ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 +// CHECK: vgatherdps ymm10, dword ptr [r15 + 2*ymm9], ymm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x92,0x14,0x4f] - vgatherdps ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 + vgatherdps ymm10, dword ptr [r15 + 2*ymm9], ymm8 -// CHECK: vgatherqps xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 +// CHECK: vgatherqps xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x93,0x14,0x4f] - vgatherqps xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 + vgatherqps xmm10, dword ptr [r15 + 2*ymm9], xmm8 -// CHECK: vpgatherdq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vpgatherdq xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x90,0x14,0x4f] - vpgatherdq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vpgatherdq xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vpgatherqq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vpgatherqq xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x91,0x14,0x4f] - vpgatherqq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vpgatherqq xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vpgatherdq ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 +// CHECK: vpgatherdq ymm2, qword ptr [rdi + 2*xmm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x90,0x14,0x4f] - vpgatherdq ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 + vpgatherdq ymm2, qword ptr [rdi + 2*xmm1], ymm0 -// CHECK: vpgatherqq ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +// CHECK: vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x91,0x14,0x4f] - vpgatherqq ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 + vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 -// CHECK: vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x90,0x14,0x4f] - vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 + vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vpgatherqd xmm10, qword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vpgatherqd xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x91,0x14,0x4f] - vpgatherqd xmm10, qword ptr [r15 + 2*xmm9], xmm8 + vpgatherqd xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vpgatherdd ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 +// CHECK: vpgatherdd ymm10, dword ptr [r15 + 2*ymm9], ymm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x90,0x14,0x4f] - vpgatherdd ymm10, ymmword ptr [r15 + 
2*ymm9], ymm8 + vpgatherdd ymm10, dword ptr [r15 + 2*ymm9], ymm8 -// CHECK: vpgatherqd xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 +// CHECK: vpgatherqd xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x91,0x14,0x4f] - vpgatherqd xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 + vpgatherqd xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: vcvtpd2ps xmm0, xmm15 // CHECK: encoding: [0xc4,0xc1,0x79,0x5a,0xc7] diff --git a/llvm/test/MC/X86/avx512-intel.s b/llvm/test/MC/X86/avx512-intel.s index d8ad3c4426176..1cbf21c7eb1b0 100644 --- a/llvm/test/MC/X86/avx512-intel.s +++ b/llvm/test/MC/X86/avx512-intel.s @@ -37900,450 +37900,450 @@ vaddpd zmm1, zmm1, zmm2, {rz-sae} // CHECK: encoding: [0x62,0xe2,0xa5,0x50,0x77,0xaa,0xf8,0xfb,0xff,0xff] vpermi2pd zmm21, zmm27, qword ptr [rdx - 1032]{1to8} -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [r14 + 8*ymm16 + 123] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [r14 + 8*ymm16 + 123] // CHECK: encoding: [0x62,0xd2,0xfd,0x41,0x92,0xb4,0xc6,0x7b,0x00,0x00,0x00] - vgatherdpd zmm6 {k1},ZMMWORD PTR [r14+ymm16*8+0x7b] + vgatherdpd zmm6 {k1},QWORD PTR [r14+ymm16*8+0x7b] -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [r9 + ymm16 + 256] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [r9 + ymm16 + 256] // CHECK: encoding: [0x62,0xd2,0xfd,0x41,0x92,0x74,0x01,0x20] - vgatherdpd zmm6{k1},ZMMWORD PTR [r9+ymm16*1+0x100] + vgatherdpd zmm6{k1},QWORD PTR [r9+ymm16*1+0x100] -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [rcx + 4*ymm16 + 1024] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [rcx + 4*ymm16 + 1024] // CHECK: encoding: [0x62,0xf2,0xfd,0x41,0x92,0xb4,0x81,0x00,0x04,0x00,0x00] - vgatherdpd zmm6{k1},ZMMWORD PTR [rcx+ymm16*4+0x400] + vgatherdpd zmm6{k1},QWORD PTR [rcx+ymm16*4+0x400] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [r14 + 8*zmm19 + 123] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [r14 + 8*zmm19 + 123] // CHECK: encoding: [0x62,0x52,0x7d,0x41,0x92,0x8c,0xde,0x7b,0x00,0x00,0x00] - vgatherdps zmm9{k1},ZMMWORD PTR [r14+zmm19*8+0x7b] + vgatherdps zmm9{k1},DWORD PTR [r14+zmm19*8+0x7b] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [r9 + zmm19 + 256] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [r9 + zmm19 + 256] // CHECK: encoding: [0x62,0x52,0x7d,0x41,0x92,0x4c,0x19,0x40] - vgatherdps zmm9{k1},ZMMWORD PTR [r9+zmm19*1+0x100] + vgatherdps zmm9{k1},DWORD PTR [r9+zmm19*1+0x100] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [rcx + 4*zmm19 + 1024] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [rcx + 4*zmm19 + 1024] // CHECK: encoding: [0x62,0x72,0x7d,0x41,0x92,0x8c,0x99,0x00,0x04,0x00,0x00] - vgatherdps zmm9{k1},ZMMWORD PTR [rcx+zmm19*4+0x400] + vgatherdps zmm9{k1},DWORD PTR [rcx+zmm19*4+0x400] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [r14 + 8*zmm2 + 123] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [r14 + 8*zmm2 + 123] // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x93,0xac,0xd6,0x7b,0x00,0x00,0x00] - vgatherqpd zmm29{k1},ZMMWORD PTR [r14+zmm2*8+0x7b] + vgatherqpd zmm29{k1},QWORD PTR [r14+zmm2*8+0x7b] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [r9 + zmm2 + 256] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [r9 + zmm2 + 256] // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x93,0x6c,0x11,0x20] - vgatherqpd zmm29{k1},ZMMWORD PTR [r9+zmm2*1+0x100] + vgatherqpd zmm29{k1},QWORD PTR [r9+zmm2*1+0x100] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [rcx + 4*zmm2 + 1024] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [rcx + 4*zmm2 + 1024] // CHECK: encoding: [0x62,0x62,0xfd,0x49,0x93,0xac,0x91,0x00,0x04,0x00,0x00] - vgatherqpd zmm29{k1},ZMMWORD PTR [rcx+zmm2*4+0x400] + vgatherqpd zmm29{k1},QWORD PTR 
[rcx+zmm2*4+0x400]

-// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [r14 + 8*zmm4 + 123]
+// CHECK: vgatherqps ymm18 {k1}, dword ptr [r14 + 8*zmm4 + 123]
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0x93,0x94,0xe6,0x7b,0x00,0x00,0x00]
- vgatherqps ymm18{k1},YMMWORD PTR [r14+zmm4*8+0x7b]
+ vgatherqps ymm18{k1},DWORD PTR [r14+zmm4*8+0x7b]

-// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [r9 + zmm4 + 256]
+// CHECK: vgatherqps ymm18 {k1}, dword ptr [r9 + zmm4 + 256]
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0x93,0x54,0x21,0x40]
- vgatherqps ymm18{k1},YMMWORD PTR [r9+zmm4*1+0x100]
+ vgatherqps ymm18{k1},DWORD PTR [r9+zmm4*1+0x100]

-// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [rcx + 4*zmm4 + 1024]
+// CHECK: vgatherqps ymm18 {k1}, dword ptr [rcx + 4*zmm4 + 1024]
 // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0x93,0x94,0xa1,0x00,0x04,0x00,0x00]
- vgatherqps ymm18{k1},YMMWORD PTR [rcx+zmm4*4+0x400]
+ vgatherqps ymm18{k1},DWORD PTR [rcx+zmm4*4+0x400]

-// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [r14 + 8*zmm11 + 123]
+// CHECK: vpgatherdd zmm17 {k1}, dword ptr [r14 + 8*zmm11 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x8c,0xde,0x7b,0x00,0x00,0x00]
- vpgatherdd zmm17{k1},ZMMWORD PTR [r14+zmm11*8+0x7b]
+ vpgatherdd zmm17{k1},DWORD PTR [r14+zmm11*8+0x7b]

-// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [r9 + zmm11 + 256]
+// CHECK: vpgatherdd zmm17 {k1}, dword ptr [r9 + zmm11 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x4c,0x19,0x40]
- vpgatherdd zmm17{k1},ZMMWORD PTR [r9+zmm11*1+0x100]
+ vpgatherdd zmm17{k1},DWORD PTR [r9+zmm11*1+0x100]

-// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [rcx + 4*zmm11 + 1024]
+// CHECK: vpgatherdd zmm17 {k1}, dword ptr [rcx + 4*zmm11 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x49,0x90,0x8c,0x99,0x00,0x04,0x00,0x00]
- vpgatherdd zmm17{k1},ZMMWORD PTR [rcx+zmm11*4+0x400]
+ vpgatherdd zmm17{k1},DWORD PTR [rcx+zmm11*4+0x400]

-// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [r14 + 8*ymm14 + 123]
+// CHECK: vpgatherdq zmm8 {k1}, qword ptr [r14 + 8*ymm14 + 123]
 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x84,0xf6,0x7b,0x00,0x00,0x00]
- vpgatherdq zmm8{k1},ZMMWORD PTR [r14+ymm14*8+0x7b]
+ vpgatherdq zmm8{k1},QWORD PTR [r14+ymm14*8+0x7b]

-// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [r9 + ymm14 + 256]
+// CHECK: vpgatherdq zmm8 {k1}, qword ptr [r9 + ymm14 + 256]
 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x44,0x31,0x20]
- vpgatherdq zmm8{k1},ZMMWORD PTR [r9+ymm14*1+0x100]
+ vpgatherdq zmm8{k1},QWORD PTR [r9+ymm14*1+0x100]

-// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [rcx + 4*ymm14 + 1024]
+// CHECK: vpgatherdq zmm8 {k1}, qword ptr [rcx + 4*ymm14 + 1024]
 // CHECK: encoding: [0x62,0x32,0xfd,0x49,0x90,0x84,0xb1,0x00,0x04,0x00,0x00]
- vpgatherdq zmm8{k1},ZMMWORD PTR [rcx+ymm14*4+0x400]
+ vpgatherdq zmm8{k1},QWORD PTR [rcx+ymm14*4+0x400]

-// CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [r14 + 8*zmm17 + 123]
+// CHECK: vpgatherqd ymm3 {k1}, dword ptr [r14 + 8*zmm17 + 123]
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x9c,0xce,0x7b,0x00,0x00,0x00]
- vpgatherqd ymm3{k1},YMMWORD PTR [r14+zmm17*8+0x7b]
+ vpgatherqd ymm3{k1},DWORD PTR [r14+zmm17*8+0x7b]

-// CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [r9 + zmm17 + 256]
+// CHECK: vpgatherqd ymm3 {k1}, dword ptr [r9 + zmm17 + 256]
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x5c,0x09,0x40]
- vpgatherqd ymm3{k1},YMMWORD PTR [r9+zmm17*1+0x100]
+ vpgatherqd ymm3{k1},DWORD PTR [r9+zmm17*1+0x100]

-// CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [rcx + 4*zmm17 + 1024]
+// CHECK: vpgatherqd ymm3 {k1}, dword ptr [rcx + 4*zmm17 + 1024]
 // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0x91,0x9c,0x89,0x00,0x04,0x00,0x00]
- vpgatherqd ymm3{k1},YMMWORD PTR [rcx+zmm17*4+0x400]
+ vpgatherqd ymm3{k1},DWORD PTR [rcx+zmm17*4+0x400]

-// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [r14 + 8*zmm21 + 123]
+// CHECK: vpgatherqq zmm17 {k1}, qword ptr [r14 + 8*zmm21 + 123]
 // CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x8c,0xee,0x7b,0x00,0x00,0x00]
- vpgatherqq zmm17{k1},ZMMWORD PTR [r14+zmm21*8+0x7b]
+ vpgatherqq zmm17{k1},QWORD PTR [r14+zmm21*8+0x7b]

-// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [r9 + zmm21 + 256]
+// CHECK: vpgatherqq zmm17 {k1}, qword ptr [r9 + zmm21 + 256]
 // CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x4c,0x29,0x20]
- vpgatherqq zmm17{k1},ZMMWORD PTR [r9+zmm21*1+0x100]
+ vpgatherqq zmm17{k1},QWORD PTR [r9+zmm21*1+0x100]

-// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [rcx + 4*zmm21 + 1024]
+// CHECK: vpgatherqq zmm17 {k1}, qword ptr [rcx + 4*zmm21 + 1024]
 // CHECK: encoding: [0x62,0xe2,0xfd,0x41,0x91,0x8c,0xa9,0x00,0x04,0x00,0x00]
- vpgatherqq zmm17{k1},ZMMWORD PTR [rcx+zmm21*4+0x400]
+ vpgatherqq zmm17{k1},QWORD PTR [rcx+zmm21*4+0x400]

-// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19
+// CHECK: vpscatterdd dword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00]
- vpscatterdd ZMMWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19
+ vpscatterdd DWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19

-// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19
+// CHECK: vpscatterdd dword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00]
- vpscatterdd ZMMWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19
+ vpscatterdd DWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19

-// CHECK: vpscatterdd zmmword ptr [r9 + zmm16 + 256] {k1}, zmm19
+// CHECK: vpscatterdd dword ptr [r9 + zmm16 + 256] {k1}, zmm19
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x5c,0x01,0x40]
- vpscatterdd ZMMWORD PTR [r9+zmm16*1+0x100]{k1},zmm19
+ vpscatterdd DWORD PTR [r9+zmm16*1+0x100]{k1},zmm19

-// CHECK: vpscatterdd zmmword ptr [rcx + 4*zmm16 + 1024] {k1}, zmm19
+// CHECK: vpscatterdd dword ptr [rcx + 4*zmm16 + 1024] {k1}, zmm19
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa0,0x9c,0x81,0x00,0x04,0x00,0x00]
- vpscatterdd ZMMWORD PTR [rcx+zmm16*4+0x400]{k1},zmm19
+ vpscatterdd DWORD PTR [rcx+zmm16*4+0x400]{k1},zmm19

-// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5
+// CHECK: vpscatterdq qword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5
 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00]
- vpscatterdq ZMMWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5
+ vpscatterdq QWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5

-// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5
+// CHECK: vpscatterdq qword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5
 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00]
- vpscatterdq ZMMWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5
+ vpscatterdq QWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5

-// CHECK: vpscatterdq zmmword ptr [r9 + ymm6 + 256] {k1}, zmm5
+// CHECK: vpscatterdq qword ptr [r9 + ymm6 + 256] {k1}, zmm5
 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0x6c,0x31,0x20]
- vpscatterdq ZMMWORD PTR [r9+ymm6*1+0x100]{k1},zmm5
+ vpscatterdq QWORD PTR [r9+ymm6*1+0x100]{k1},zmm5

-// CHECK: vpscatterdq zmmword ptr [rcx + 4*ymm6 + 1024] {k1}, zmm5
+// CHECK: vpscatterdq qword ptr [rcx + 4*ymm6 + 1024] {k1}, zmm5
 // CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xa0,0xac,0xb1,0x00,0x04,0x00,0x00]
- vpscatterdq ZMMWORD PTR [rcx+ymm6*4+0x400]{k1},zmm5
+ vpscatterdq QWORD PTR [rcx+ymm6*4+0x400]{k1},zmm5

-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00]
- vpscatterqd YMMWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20
+ vpscatterqd DWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20

-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00]
- vpscatterqd YMMWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20
+ vpscatterqd DWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20

-// CHECK: vpscatterqd ymmword ptr [r9 + zmm2 + 256] {k1}, ymm20
+// CHECK: vpscatterqd dword ptr [r9 + zmm2 + 256] {k1}, ymm20
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0x64,0x11,0x40]
- vpscatterqd YMMWORD PTR [r9+zmm2*1+0x100]{k1},ymm20
+ vpscatterqd DWORD PTR [r9+zmm2*1+0x100]{k1},ymm20

-// CHECK: vpscatterqd ymmword ptr [rcx + 4*zmm2 + 1024] {k1}, ymm20
+// CHECK: vpscatterqd dword ptr [rcx + 4*zmm2 + 1024] {k1}, ymm20
 // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0xa1,0xa4,0x91,0x00,0x04,0x00,0x00]
- vpscatterqd YMMWORD PTR [rcx+zmm2*4+0x400]{k1},ymm20
+ vpscatterqd DWORD PTR [rcx+zmm2*4+0x400]{k1},ymm20

-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14
 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00]
- vpscatterqq ZMMWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14
+ vpscatterqq QWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14

-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14
 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00]
- vpscatterqq ZMMWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14
+ vpscatterqq QWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14

-// CHECK: vpscatterqq zmmword ptr [r9 + zmm20 + 256] {k1}, zmm14
+// CHECK: vpscatterqq qword ptr [r9 + zmm20 + 256] {k1}, zmm14
 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0x74,0x21,0x20]
- vpscatterqq ZMMWORD PTR [r9+zmm20*1+0x100]{k1},zmm14
+ vpscatterqq QWORD PTR [r9+zmm20*1+0x100]{k1},zmm14

-// CHECK: vpscatterqq zmmword ptr [rcx + 4*zmm20 + 1024] {k1}, zmm14
+// CHECK: vpscatterqq qword ptr [rcx + 4*zmm20 + 1024] {k1}, zmm14
 // CHECK: encoding: [0x62,0x72,0xfd,0x41,0xa1,0xb4,0xa1,0x00,0x04,0x00,0x00]
- vpscatterqq ZMMWORD PTR [rcx+zmm20*4+0x400]{k1},zmm14
+ vpscatterqq QWORD PTR [rcx+zmm20*4+0x400]{k1},zmm14

-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xc6,0x7b,0x00,0x00,0x00]
- vscatterdpd ZMMWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18
+ vscatterdpd QWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xc6,0x7b,0x00,0x00,0x00]
- vscatterdpd ZMMWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18
+ vscatterdpd QWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [r9 + ymm24 + 256] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r9 + ymm24 + 256] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x54,0x01,0x20]
- vscatterdpd ZMMWORD PTR [r9+ymm24*1+0x100]{k1},zmm18
+ vscatterdpd QWORD PTR [r9+ymm24*1+0x100]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [rcx + 4*ymm24 + 1024] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [rcx + 4*ymm24 + 1024] {k1}, zmm18
 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa2,0x94,0x81,0x00,0x04,0x00,0x00]
- vscatterdpd ZMMWORD PTR [rcx+ymm24*4+0x400]{k1},zmm18
+ vscatterdpd QWORD PTR [rcx+ymm24*4+0x400]{k1},zmm18

-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x8c,0xde,0x7b,0x00,0x00,0x00]
- vscatterdps ZMMWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17
+ vscatterdps DWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17

-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x8c,0xde,0x7b,0x00,0x00,0x00]
- vscatterdps ZMMWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17
+ vscatterdps DWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17

-// CHECK: vscatterdps zmmword ptr [r9 + zmm19 + 256] {k1}, zmm17
+// CHECK: vscatterdps dword ptr [r9 + zmm19 + 256] {k1}, zmm17
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x4c,0x19,0x40]
- vscatterdps ZMMWORD PTR [r9+zmm19*1+0x100]{k1},zmm17
+ vscatterdps DWORD PTR [r9+zmm19*1+0x100]{k1},zmm17

-// CHECK: vscatterdps zmmword ptr [rcx + 4*zmm19 + 1024] {k1}, zmm17
+// CHECK: vscatterdps dword ptr [rcx + 4*zmm19 + 1024] {k1}, zmm17
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa2,0x8c,0x99,0x00,0x04,0x00,0x00]
- vscatterdps ZMMWORD PTR [rcx+zmm19*4+0x400]{k1},zmm17
+ vscatterdps DWORD PTR [rcx+zmm19*4+0x400]{k1},zmm17

-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0xb4,0xe6,0x7b,0x00,0x00,0x00]
- vscatterqpd ZMMWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22
+ vscatterqpd QWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22

-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0xb4,0xe6,0x7b,0x00,0x00,0x00]
- vscatterqpd ZMMWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22
+ vscatterqpd QWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22

-// CHECK: vscatterqpd zmmword ptr [r9 + zmm28 + 256] {k1}, zmm22
+// CHECK: vscatterqpd qword ptr [r9 + zmm28 + 256] {k1}, zmm22
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0x74,0x21,0x20]
- vscatterqpd ZMMWORD PTR [r9+zmm28*1+0x100]{k1},zmm22
+ vscatterqpd QWORD PTR [r9+zmm28*1+0x100]{k1},zmm22

-// CHECK: vscatterqpd zmmword ptr [rcx + 4*zmm28 + 1024] {k1}, zmm22
+// CHECK: vscatterqpd qword ptr [rcx + 4*zmm28 + 1024] {k1}, zmm22
 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa3,0xb4,0xa1,0x00,0x04,0x00,0x00]
- vscatterqpd ZMMWORD PTR [rcx+zmm28*4+0x400]{k1},zmm22
+ vscatterqpd QWORD PTR [rcx+zmm28*4+0x400]{k1},zmm22

-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6
 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0xb4,0xde,0x7b,0x00,0x00,0x00]
- vscatterqps YMMWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6
+ vscatterqps DWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6

-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6
 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0xb4,0xde,0x7b,0x00,0x00,0x00]
- vscatterqps YMMWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6
+ vscatterqps DWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6

-// CHECK: vscatterqps ymmword ptr [r9 + zmm27 + 256] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [r9 + zmm27 + 256] {k1}, ymm6
 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0x74,0x19,0x40]
- vscatterqps YMMWORD PTR [r9+zmm27*1+0x100]{k1},ymm6
+ vscatterqps DWORD PTR [r9+zmm27*1+0x100]{k1},ymm6

-// CHECK: vscatterqps ymmword ptr [rcx + 4*zmm27 + 1024] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [rcx + 4*zmm27 + 1024] {k1}, ymm6
 // CHECK: encoding: [0x62,0xb2,0x7d,0x41,0xa3,0xb4,0x99,0x00,0x04,0x00,0x00]
- vscatterqps YMMWORD PTR [rcx+zmm27*4+0x400]{k1},ymm6
+ vscatterqps DWORD PTR [rcx+zmm27*4+0x400]{k1},ymm6

-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xde,0x85,0xff,0xff,0xff]
- vscatterdpd ZMMWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18
+ vscatterdpd QWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xde,0x85,0xff,0xff,0xff]
- vscatterdpd ZMMWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18
+ vscatterdpd QWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [r9 + ymm27 + 256] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r9 + ymm27 + 256] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x54,0x19,0x20]
- vscatterdpd ZMMWORD PTR [r9+ymm27*1+0x100]{k1},zmm18
+ vscatterdpd QWORD PTR [r9+ymm27*1+0x100]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [rcx + 4*ymm27 + 1024] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [rcx + 4*ymm27 + 1024] {k1}, zmm18
 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa2,0x94,0x99,0x00,0x04,0x00,0x00]
- vscatterdpd ZMMWORD PTR [rcx+ymm27*4+0x400]{k1},zmm18
+ vscatterdpd QWORD PTR [rcx+ymm27*4+0x400]{k1},zmm18

-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x8c,0xce,0x85,0xff,0xff,0xff]
- vscatterdps ZMMWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1
+ vscatterdps DWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1

-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x8c,0xce,0x85,0xff,0xff,0xff]
- vscatterdps ZMMWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1
+ vscatterdps DWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1

-// CHECK: vscatterdps zmmword ptr [r9 + zmm17 + 256] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r9 + zmm17 + 256] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x4c,0x09,0x40]
- vscatterdps ZMMWORD PTR [r9+zmm17*1+0x100]{k1},zmm1
+ vscatterdps DWORD PTR [r9+zmm17*1+0x100]{k1},zmm1

-// CHECK: vscatterdps zmmword ptr [rcx + 4*zmm17 + 1024] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [rcx + 4*zmm17 + 1024] {k1}, zmm1
 // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0xa2,0x8c,0x89,0x00,0x04,0x00,0x00]
- vscatterdps ZMMWORD PTR [rcx+zmm17*4+0x400]{k1},zmm1
+ vscatterdps DWORD PTR [rcx+zmm17*4+0x400]{k1},zmm1

-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x84,0xce,0x85,0xff,0xff,0xff]
- vscatterqpd ZMMWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8
+ vscatterqpd QWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8

-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x84,0xce,0x85,0xff,0xff,0xff]
- vscatterqpd ZMMWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8
+ vscatterqpd QWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8

-// CHECK: vscatterqpd zmmword ptr [r9 + zmm25 + 256] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r9 + zmm25 + 256] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x44,0x09,0x20]
- vscatterqpd ZMMWORD PTR [r9+zmm25*1+0x100]{k1},zmm8
+ vscatterqpd QWORD PTR [r9+zmm25*1+0x100]{k1},zmm8

-// CHECK: vscatterqpd zmmword ptr [rcx + 4*zmm25 + 1024] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [rcx + 4*zmm25 + 1024] {k1}, zmm8
 // CHECK: encoding: [0x62,0x32,0xfd,0x41,0xa3,0x84,0x89,0x00,0x04,0x00,0x00]
- vscatterqpd ZMMWORD PTR [rcx+zmm25*4+0x400]{k1},zmm8
+ vscatterqpd QWORD PTR [rcx+zmm25*4+0x400]{k1},zmm8

-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0xac,0xd6,0x85,0xff,0xff,0xff]
- vscatterqps YMMWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13
+ vscatterqps DWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13

-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0xac,0xd6,0x85,0xff,0xff,0xff]
- vscatterqps YMMWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13
+ vscatterqps DWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13

-// CHECK: vscatterqps ymmword ptr [r9 + zmm10 + 256] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r9 + zmm10 + 256] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0x6c,0x11,0x40]
- vscatterqps YMMWORD PTR [r9+zmm10*1+0x100]{k1},ymm13
+ vscatterqps DWORD PTR [r9+zmm10*1+0x100]{k1},ymm13

-// CHECK: vscatterqps ymmword ptr [rcx + 4*zmm10 + 1024] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [rcx + 4*zmm10 + 1024] {k1}, ymm13
 // CHECK: encoding: [0x62,0x32,0x7d,0x49,0xa3,0xac,0x91,0x00,0x04,0x00,0x00]
- vscatterqps YMMWORD PTR [rcx+zmm10*4+0x400]{k1},ymm13
+ vscatterqps DWORD PTR [rcx+zmm10*4+0x400]{k1},ymm13

-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [r14 + 8*ymm5 - 123]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [r14 + 8*ymm5 - 123]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x92,0xb4,0xee,0x85,0xff,0xff,0xff]
- vgatherdpd zmm30{k1},ZMMWORD PTR [r14+ymm5*8-0x7b]
+ vgatherdpd zmm30{k1},QWORD PTR [r14+ymm5*8-0x7b]

-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [r9 + ymm5 + 256]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [r9 + ymm5 + 256]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x92,0x74,0x29,0x20]
- vgatherdpd zmm30{k1},ZMMWORD PTR [r9+ymm5*1+0x100]
+ vgatherdpd zmm30{k1},QWORD PTR [r9+ymm5*1+0x100]

-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [rcx + 4*ymm5 + 1024]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [rcx + 4*ymm5 + 1024]
 // CHECK: encoding: [0x62,0x62,0xfd,0x49,0x92,0xb4,0xa9,0x00,0x04,0x00,0x00]
- vgatherdpd zmm30{k1},ZMMWORD PTR [rcx+ymm5*4+0x400]
+ vgatherdpd zmm30{k1},QWORD PTR [rcx+ymm5*4+0x400]

-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [r14 + 8*zmm26 - 123]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [r14 + 8*zmm26 - 123]
 // CHECK: encoding: [0x62,0x12,0x7d,0x41,0x92,0x84,0xd6,0x85,0xff,0xff,0xff]
- vgatherdps zmm8{k1},ZMMWORD PTR [r14+zmm26*8-0x7b]
+ vgatherdps zmm8{k1},DWORD PTR [r14+zmm26*8-0x7b]

-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [r9 + zmm26 + 256]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [r9 + zmm26 + 256]
 // CHECK: encoding: [0x62,0x12,0x7d,0x41,0x92,0x44,0x11,0x40]
- vgatherdps zmm8{k1},ZMMWORD PTR [r9+zmm26*1+0x100]
+ vgatherdps zmm8{k1},DWORD PTR [r9+zmm26*1+0x100]

-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [rcx + 4*zmm26 + 1024]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [rcx + 4*zmm26 + 1024]
 // CHECK: encoding: [0x62,0x32,0x7d,0x41,0x92,0x84,0x91,0x00,0x04,0x00,0x00]
- vgatherdps zmm8{k1},ZMMWORD PTR [rcx+zmm26*4+0x400]
+ vgatherdps zmm8{k1},DWORD PTR [rcx+zmm26*4+0x400]

-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [r14 + 8*zmm13 - 123]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [r14 + 8*zmm13 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x49,0x93,0x9c,0xee,0x85,0xff,0xff,0xff]
- vgatherqpd zmm27{k1},ZMMWORD PTR [r14+zmm13*8-0x7b]
+ vgatherqpd zmm27{k1},QWORD PTR [r14+zmm13*8-0x7b]

-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [r9 + zmm13 + 256]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [r9 + zmm13 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x49,0x93,0x5c,0x29,0x20]
- vgatherqpd zmm27{k1},ZMMWORD PTR [r9+zmm13*1+0x100]
+ vgatherqpd zmm27{k1},QWORD PTR [r9+zmm13*1+0x100]

-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [rcx + 4*zmm13 + 1024]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [rcx + 4*zmm13 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x49,0x93,0x9c,0xa9,0x00,0x04,0x00,0x00]
- vgatherqpd zmm27{k1},ZMMWORD PTR [rcx+zmm13*4+0x400]
+ vgatherqpd zmm27{k1},QWORD PTR [rcx+zmm13*4+0x400]

-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [r14 + 8*zmm14 - 123]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [r14 + 8*zmm14 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x49,0x93,0x9c,0xf6,0x85,0xff,0xff,0xff]
- vgatherqps ymm27{k1},YMMWORD PTR [r14+zmm14*8-0x7b]
+ vgatherqps ymm27{k1},DWORD PTR [r14+zmm14*8-0x7b]

-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [r9 + zmm14 + 256]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [r9 + zmm14 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x49,0x93,0x5c,0x31,0x40]
- vgatherqps ymm27{k1},YMMWORD PTR [r9+zmm14*1+0x100]
+ vgatherqps ymm27{k1},DWORD PTR [r9+zmm14*1+0x100]

-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [rcx + 4*zmm14 + 1024]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [rcx + 4*zmm14 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x49,0x93,0x9c,0xb1,0x00,0x04,0x00,0x00]
- vgatherqps ymm27{k1},YMMWORD PTR [rcx+zmm14*4+0x400]
+ vgatherqps ymm27{k1},DWORD PTR [rcx+zmm14*4+0x400]

-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [r14 + 8*zmm16 - 123]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [r14 + 8*zmm16 - 123]
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x90,0xbc,0xc6,0x85,0xff,0xff,0xff]
- vpgatherdd zmm7{k1},ZMMWORD PTR [r14+zmm16*8-0x7b]
+ vpgatherdd zmm7{k1},DWORD PTR [r14+zmm16*8-0x7b]

-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [r9 + zmm16 + 256]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [r9 + zmm16 + 256]
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x90,0x7c,0x01,0x40]
- vpgatherdd zmm7{k1},ZMMWORD PTR [r9+zmm16*1+0x100]
+ vpgatherdd zmm7{k1},DWORD PTR [r9+zmm16*1+0x100]

-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [rcx + 4*zmm16 + 1024]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [rcx + 4*zmm16 + 1024]
 // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0x90,0xbc,0x81,0x00,0x04,0x00,0x00]
- vpgatherdd zmm7{k1},ZMMWORD PTR [rcx+zmm16*4+0x400]
+ vpgatherdd zmm7{k1},DWORD PTR [rcx+zmm16*4+0x400]

-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [r14 + 8*ymm7 - 123]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [r14 + 8*ymm7 - 123]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x90,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdq zmm25{k1},ZMMWORD PTR [r14+ymm7*8-0x7b]
+ vpgatherdq zmm25{k1},QWORD PTR [r14+ymm7*8-0x7b]

-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [r9 + ymm7 + 256]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [r9 + ymm7 + 256]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x90,0x4c,0x39,0x20]
- vpgatherdq zmm25{k1},ZMMWORD PTR [r9+ymm7*1+0x100]
+ vpgatherdq zmm25{k1},QWORD PTR [r9+ymm7*1+0x100]

-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [rcx + 4*ymm7 + 1024]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [rcx + 4*ymm7 + 1024]
 // CHECK: encoding: [0x62,0x62,0xfd,0x49,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq zmm25{k1},ZMMWORD PTR [rcx+ymm7*4+0x400]
+ vpgatherdq zmm25{k1},QWORD PTR [rcx+ymm7*4+0x400]

-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [r14 + 8*zmm17 - 123]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [r14 + 8*zmm17 - 123]
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0x91,0x9c,0xce,0x85,0xff,0xff,0xff]
- vpgatherqd ymm19{k1},YMMWORD PTR [r14+zmm17*8-0x7b]
+ vpgatherqd ymm19{k1},DWORD PTR [r14+zmm17*8-0x7b]

-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [r9 + zmm17 + 256]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [r9 + zmm17 + 256]
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0x91,0x5c,0x09,0x40]
- vpgatherqd ymm19{k1},YMMWORD PTR [r9+zmm17*1+0x100]
+ vpgatherqd ymm19{k1},DWORD PTR [r9+zmm17*1+0x100]

-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [rcx + 4*zmm17 + 1024]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [rcx + 4*zmm17 + 1024]
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0x91,0x9c,0x89,0x00,0x04,0x00,0x00]
- vpgatherqd ymm19{k1},YMMWORD PTR [rcx+zmm17*4+0x400]
+ vpgatherqd ymm19{k1},DWORD PTR [rcx+zmm17*4+0x400]

-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [r14 + 8*zmm13 - 123]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [r14 + 8*zmm13 - 123]
 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x91,0x94,0xee,0x85,0xff,0xff,0xff]
- vpgatherqq zmm10{k1},ZMMWORD PTR [r14+zmm13*8-0x7b]
+ vpgatherqq zmm10{k1},QWORD PTR [r14+zmm13*8-0x7b]

-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [r9 + zmm13 + 256]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [r9 + zmm13 + 256]
 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x91,0x54,0x29,0x20]
- vpgatherqq zmm10{k1},ZMMWORD PTR [r9+zmm13*1+0x100]
+ vpgatherqq zmm10{k1},QWORD PTR [r9+zmm13*1+0x100]

-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [rcx + 4*zmm13 + 1024]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [rcx + 4*zmm13 + 1024]
 // CHECK: encoding: [0x62,0x32,0xfd,0x49,0x91,0x94,0xa9,0x00,0x04,0x00,0x00]
- vpgatherqq zmm10{k1},ZMMWORD PTR [rcx+zmm13*4+0x400]
+ vpgatherqq zmm10{k1},QWORD PTR [rcx+zmm13*4+0x400]

-// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0xbc,0xe6,0x85,0xff,0xff,0xff]
- vpscatterdd ZMMWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23
+ vpscatterdd DWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23

-// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0xbc,0xe6,0x85,0xff,0xff,0xff]
- vpscatterdd ZMMWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23
+ vpscatterdd DWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23

-// CHECK: vpscatterdd zmmword ptr [r9 + zmm4 + 256] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r9 + zmm4 + 256] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0x7c,0x21,0x40]
- vpscatterdd ZMMWORD PTR [r9+zmm4*1+0x100]{k1},zmm23
+ vpscatterdd DWORD PTR [r9+zmm4*1+0x100]{k1},zmm23

-// CHECK: vpscatterdd zmmword ptr [rcx + 4*zmm4 + 1024] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [rcx + 4*zmm4 + 1024] {k1}, zmm23
 // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0xa0,0xbc,0xa1,0x00,0x04,0x00,0x00]
- vpscatterdd ZMMWORD PTR [rcx+zmm4*4+0x400]{k1},zmm23
+ vpscatterdd DWORD PTR [rcx+zmm4*4+0x400]{k1},zmm23

-// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x8c,0xce,0x85,0xff,0xff,0xff]
- vpscatterdq ZMMWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1
+ vpscatterdq QWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1

-// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x8c,0xce,0x85,0xff,0xff,0xff]
- vpscatterdq ZMMWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1
+ vpscatterdq QWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1

-// CHECK: vpscatterdq zmmword ptr [r9 + ymm25 + 256] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r9 + ymm25 + 256] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x4c,0x09,0x20]
- vpscatterdq ZMMWORD PTR [r9+ymm25*1+0x100]{k1},zmm1
+ vpscatterdq QWORD PTR [r9+ymm25*1+0x100]{k1},zmm1

-// CHECK: vpscatterdq zmmword ptr [rcx + 4*ymm25 + 1024] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [rcx + 4*ymm25 + 1024] {k1}, zmm1
 // CHECK: encoding: [0x62,0xb2,0xfd,0x41,0xa0,0x8c,0x89,0x00,0x04,0x00,0x00]
- vpscatterdq ZMMWORD PTR [rcx+ymm25*4+0x400]{k1},zmm1
+ vpscatterdq QWORD PTR [rcx+ymm25*4+0x400]{k1},zmm1

-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0xbc,0xf6,0x85,0xff,0xff,0xff]
- vpscatterqd YMMWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23
+ vpscatterqd DWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23

-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0xbc,0xf6,0x85,0xff,0xff,0xff]
- vpscatterqd YMMWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23
+ vpscatterqd DWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23

-// CHECK: vpscatterqd ymmword ptr [r9 + zmm22 + 256] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r9 + zmm22 + 256] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0x7c,0x31,0x40]
- vpscatterqd YMMWORD PTR [r9+zmm22*1+0x100]{k1},ymm23
+ vpscatterqd DWORD PTR [r9+zmm22*1+0x100]{k1},ymm23

-// CHECK: vpscatterqd ymmword ptr [rcx + 4*zmm22 + 1024] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [rcx + 4*zmm22 + 1024] {k1}, ymm23
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa1,0xbc,0xb1,0x00,0x04,0x00,0x00]
- vpscatterqd YMMWORD PTR [rcx+zmm22*4+0x400]{k1},ymm23
+ vpscatterqd DWORD PTR [rcx+zmm22*4+0x400]{k1},ymm23

-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x94,0xc6,0x85,0xff,0xff,0xff]
- vpscatterqq ZMMWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2
+ vpscatterqq QWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2

-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x94,0xc6,0x85,0xff,0xff,0xff]
- vpscatterqq ZMMWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2
+ vpscatterqq QWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2

-// CHECK: vpscatterqq zmmword ptr [r9 + zmm8 + 256] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r9 + zmm8 + 256] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x54,0x01,0x20]
- vpscatterqq ZMMWORD PTR [r9+zmm8*1+0x100]{k1},zmm2
+ vpscatterqq QWORD PTR [r9+zmm8*1+0x100]{k1},zmm2

-// CHECK: vpscatterqq zmmword ptr [rcx + 4*zmm8 + 1024] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [rcx + 4*zmm8 + 1024] {k1}, zmm2
 // CHECK: encoding: [0x62,0xb2,0xfd,0x49,0xa1,0x94,0x81,0x00,0x04,0x00,0x00]
- vpscatterqq ZMMWORD PTR [rcx+zmm8*4+0x400]{k1},zmm2
+ vpscatterqq QWORD PTR [rcx+zmm8*4+0x400]{k1},zmm2
diff --git a/llvm/test/MC/X86/avx512f_vl-intel.s b/llvm/test/MC/X86/avx512f_vl-intel.s
index 31c43afe50171..ed3292b83f4d7 100644
--- a/llvm/test/MC/X86/avx512f_vl-intel.s
+++ b/llvm/test/MC/X86/avx512f_vl-intel.s
@@ -224,901 +224,901 @@
 // CHECK: encoding: [0x62,0xf1,0x64,0x30,0xc2,0xa2,0xfc,0xfd,0xff,0xff,0x7b]
 vcmpps k4,ymm19,DWORD PTR [rdx-0x204]{1to8},0x7b

-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vgatherdpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vgatherdpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x4c,0x39,0x20]
- vgatherdpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherdpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x92,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vgatherdpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0xbc,0xfe,0x7b,0x00,0x00,0x00]
- vgatherdpd ymm23 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+ vgatherdpd ymm23 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x7c,0x39,0x20]
- vgatherdpd ymm23 {k1}, ymmword ptr [r9 + xmm31 + 256]
+ vgatherdpd ymm23 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x92,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vgatherdpd ymm23 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdpd ymm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0xbc,0xfe,0x85,0xff,0xff,0xff]
- vgatherdpd xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vgatherdpd xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x7c,0x39,0x20]
- vgatherdpd xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherdpd xmm23 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x92,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vgatherdpd xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdpd xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x94,0xfe,0x85,0xff,0xff,0xff]
- vgatherdpd ymm18 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+ vgatherdpd ymm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x54,0x39,0x20]
- vgatherdpd ymm18 {k1}, ymmword ptr [r9 + xmm31 + 256]
+ vgatherdpd ymm18 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x92,0x94,0xb9,0x00,0x04,0x00,0x00]
- vgatherdpd ymm18 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdpd ymm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x94,0xfe,0x7b,0x00,0x00,0x00]
- vgatherdps xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vgatherdps xmm18 {k1}, dword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x54,0x39,0x40]
- vgatherdps xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherdps xmm18 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x92,0x94,0xb9,0x00,0x04,0x00,0x00]
- vgatherdps xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdps xmm18 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vgatherdps ymm27 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+ vgatherdps ymm27 {k1}, dword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x5c,0x39,0x40]
- vgatherdps ymm27 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vgatherdps ymm27 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x92,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vgatherdps ymm27 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherdps ymm27 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x92,0xac,0xfe,0x85,0xff,0xff,0xff]
- vgatherdps xmm29 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vgatherdps xmm29 {k1}, dword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x92,0x6c,0x39,0x40]
- vgatherdps xmm29 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherdps xmm29 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0x92,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherdps xmm29 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherdps xmm29 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x92,0xac,0xfe,0x85,0xff,0xff,0xff]
- vgatherdps ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+ vgatherdps ymm21 {k1}, dword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x92,0x6c,0x39,0x40]
- vgatherdps ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vgatherdps ymm21 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x92,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherdps ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherdps ymm21 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vgatherqpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vgatherqpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x4c,0x39,0x20]
- vgatherqpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherqpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x93,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vgatherqpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherqpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vgatherqpd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+ vgatherqpd ymm29 {k1}, qword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0x6c,0x39,0x20]
- vgatherqpd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vgatherqpd ymm29 {k1}, qword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherqpd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherqpd ymm29 {k1}, qword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x94,0xfe,0x85,0xff,0xff,0xff]
- vgatherqpd xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vgatherqpd xmm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x54,0x39,0x20]
- vgatherqpd xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vgatherqpd xmm18 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x93,0x94,0xb9,0x00,0x04,0x00,0x00]
- vgatherqpd xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vgatherqpd xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x93,0xac,0xfe,0x85,0xff,0xff,0xff]
- vgatherqpd ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+ vgatherqpd ymm21 {k1}, qword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x93,0x6c,0x39,0x20]
- vgatherqpd ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vgatherqpd ymm21 {k1}, qword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherqpd ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherqpd ymm21 {k1}, qword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherqps xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vgatherqps xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+ vgatherqps xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherqps xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0x6c,0x39,0x40]
- vgatherqps xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+ vgatherqps xmm21 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherqps xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
- vgatherqps xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+ vgatherqps xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vgatherqps xmm19 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+ vgatherqps xmm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x5c,0x39,0x40]
- vgatherqps xmm19 {k1}, xmmword ptr [r9 + ymm31 + 256]
+ vgatherqps xmm19 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x93,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vgatherqps xmm19 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherqps xmm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherqps xmm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vgatherqps xmm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+ vgatherqps xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherqps xmm22 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0x74,0x39,0x40]
- vgatherqps xmm22 {k1}, qword ptr [r9 + xmm31 + 256]
+ vgatherqps xmm22 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherqps xmm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x93,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vgatherqps xmm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+ vgatherqps xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x93,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vgatherqps xmm30 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+ vgatherqps xmm30 {k1}, dword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x93,0x74,0x39,0x40]
- vgatherqps xmm30 {k1}, xmmword ptr [r9 + ymm31 + 256]
+ vgatherqps xmm30 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x93,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vgatherqps xmm30 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+ vgatherqps xmm30 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherdd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vpgatherdd xmm17 {k1}, dword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x4c,0x39,0x40]
- vpgatherdd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherdd xmm17 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdd xmm17 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherdd ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+ vpgatherdd ymm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x5c,0x39,0x40]
- vpgatherdd ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vpgatherdd ymm19 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x90,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdd ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherdd ymm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdd xmm22 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vpgatherdd xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x74,0x39,0x40]
- vpgatherdd xmm22 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherdd xmm22 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x90,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdd xmm22 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdd xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x90,0xac,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+ vpgatherdd ymm29 {k1}, dword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x90,0x6c,0x39,0x40]
- vpgatherdd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vpgatherdd ymm29 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x90,0xac,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherdd ymm29 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherdq xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vpgatherdq xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x4c,0x39,0x20]
- vpgatherdq xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherdq xmm17 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdq xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x94,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherdq ymm26 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+ vpgatherdq ymm26 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x54,0x39,0x20]
- vpgatherdq ymm26 {k1}, ymmword ptr [r9 + xmm31 + 256]
+ vpgatherdq ymm26 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x90,0x94,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq ymm26 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdq ymm26 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0x90,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdq xmm25 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vpgatherdq xmm25 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0x90,0x4c,0x39,0x20]
- vpgatherdq xmm25 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherdq xmm25 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq xmm25 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdq xmm25 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x90,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpgatherdq ymm22 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+ vpgatherdq ymm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x90,0x74,0x39,0x20]
- vpgatherdq ymm22 {k1}, ymmword ptr [r9 + xmm31 + 256]
+ vpgatherdq ymm22 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x90,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpgatherdq ymm22 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherdq ymm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherqd xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+ vpgatherqd xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0x6c,0x39,0x40]
- vpgatherqd xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+ vpgatherqd xmm21 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x91,0xac,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqd xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherqd xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherqd xmm25 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+ vpgatherqd xmm25 {k1}, dword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x4c,0x39,0x40]
- vpgatherqd xmm25 {k1}, xmmword ptr [r9 + ymm31 + 256]
+ vpgatherqd xmm25 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x91,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqd xmm25 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherqd xmm25 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x91,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpgatherqd xmm30 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+ vpgatherqd xmm30 {k1}, dword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x91,0x74,0x39,0x40]
- vpgatherqd xmm30 {k1}, qword ptr [r9 + xmm31 + 256]
+ vpgatherqd xmm30 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0x91,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqd xmm30 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherqd xmm30 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpgatherqd xmm28 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+ vpgatherqd xmm28 {k1}, dword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x64,0x39,0x40]
- vpgatherqd xmm28 {k1}, xmmword ptr [r9 + ymm31 + 256]
+ vpgatherqd xmm28 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x91,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqd xmm28 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherqd xmm28 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x94,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherqq xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+ vpgatherqq xmm18 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x54,0x39,0x20]
- vpgatherqq xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherqq xmm18 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x91,0x94,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqq xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherqq xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vpgatherqq ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+ vpgatherqq ymm19 {k1}, qword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x5c,0x39,0x20]
- vpgatherqq ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vpgatherqq ymm19 {k1}, qword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x91,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqq ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherqq ymm19 {k1}, qword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0xbc,0xfe,0x85,0xff,0xff,0xff]
- vpgatherqq xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+ vpgatherqq xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x7c,0x39,0x20]
- vpgatherqq xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+ vpgatherqq xmm23 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x91,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqq xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+ vpgatherqq xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x91,0x94,0xfe,0x85,0xff,0xff,0xff]
- vpgatherqq ymm26 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+ vpgatherqq ymm26 {k1}, qword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x91,0x54,0x39,0x20]
- vpgatherqq ymm26 {k1}, ymmword ptr [r9 + ymm31 + 256]
+ vpgatherqq ymm26 {k1}, qword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x91,0x94,0xb9,0x00,0x04,0x00,0x00]
- vpgatherqq ymm26 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+ vpgatherqq ymm26 {k1}, qword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+ vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20

-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+ vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20

-// CHECK: vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x64,0x39,0x40]
- vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm20
+ vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm20

-// CHECK: vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
+ vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20

-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+ vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28

-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+ vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28

-// CHECK: vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x64,0x39,0x40]
- vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm28
+ vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm28

-// CHECK: vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
+ vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28

-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+ vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17

-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+ vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17

-// CHECK: vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x4c,0x39,0x40]
- vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm17
+ vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm17

-// CHECK: vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa0,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
+ vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17

-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x84,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+ vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24

-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x84,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+ vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24

-// CHECK: vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x44,0x39,0x40]
- vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm24
+ vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm24

-// CHECK: vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa0,0x84,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
+ vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24

-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+ vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21

-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+ vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21

-// CHECK: vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0x6c,0x39,0x20]
- vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+ vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm21

-// CHECK: vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa0,0xac,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+ vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21

-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+ vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28

-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+ vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28

-// CHECK: vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0x64,0x39,0x20]
- vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm28
+ vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm28

-// CHECK: vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
+ vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28

-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+ vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28

-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+ vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28

-// CHECK: vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0x64,0x39,0x20]
- vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+ vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm28

-// CHECK: vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+ vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28

-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+ vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20

-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+ vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20

-// CHECK: vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0x64,0x39,0x20]
- vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm20
+ vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm20

-// CHECK: vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
+ vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20

-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+ vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22

-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+ vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22

-// CHECK: vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0x74,0x39,0x40]
- vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+ vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa1,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+ vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x84,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+ vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x84,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+ vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
-// CHECK: vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x44,0x39,0x40]
- vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm24
+ vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm24
-// CHECK: vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa1,0x84,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
+ vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+ vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+ vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0x74,0x39,0x40]
- vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+ vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
-// CHECK: vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa1,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+ vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0xac,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+ vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0xac,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+ vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
-// CHECK: vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm29
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x6c,0x39,0x40]
- vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm29
+ vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm29
-// CHECK: vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa1,0xac,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29
+ vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29
-// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+ vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
-// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+ vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
-// CHECK: vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x64,0x39,0x20]
- vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+ vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm28
-// CHECK: vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa1,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+ vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
-// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
+// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
+ vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
-// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
+// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x9c,0xfe,0x7b,0x00,0x00,0x00]
- vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
+ vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19
-// CHECK: vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm19
+// CHECK: vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x5c,0x39,0x20]
- vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm19
+ vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm19
-// CHECK: vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19
+// CHECK: vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa1,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19
+ vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19
-// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
+// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x84,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
+ vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
-// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
+// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x84,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
+ vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24
-// CHECK: vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24
+// CHECK: vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x44,0x39,0x20]
- vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24
+ vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm24
-// CHECK: vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
+// CHECK: vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa1,0x84,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
+ vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
-// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17
+// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17
+ vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17
-// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17
+// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17
+ vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17
-// CHECK: vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm17
+// CHECK: vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm17
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x4c,0x39,0x20]
- vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm17
+ vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm17
-// CHECK: vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17
+// CHECK: vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa1,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17
+ vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17
-// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x94,0xfe,0x7b,0x00,0x00,0x00]
- vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18
+ vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18
-// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x94,0xfe,0x7b,0x00,0x00,0x00]
- vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18
+ vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18
-// CHECK: vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm18
+// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x54,0x39,0x20]
- vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm18
+ vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm18
-// CHECK: vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18
+// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa2,0x94,0xb9,0x00,0x04,0x00,0x00]
- vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18
+ vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18
-// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0xb4,0xfe,0x7b,0x00,0x00,0x00]
- vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
+ vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
-// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0xb4,0xfe,0x7b,0x00,0x00,0x00]
- vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
+ vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
-// CHECK: vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm30
+// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm30
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x74,0x39,0x20]
- vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm30
+ vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm30
-// CHECK: vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30
+// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa2,0xb4,0xb9,0x00,0x04,0x00,0x00]
- vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30
+ vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30
-// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x9c,0xfe,0x85,0xff,0xff,0xff]
- vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+ vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
-// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x9c,0xfe,0x85,0xff,0xff,0xff]
- vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+ vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
-// CHECK: vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19
+// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x5c,0x39,0x20]
- vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19
+ vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19
-// CHECK: vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
+// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa2,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
+ vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
-// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x94,0xfe,0x85,0xff,0xff,0xff]
- vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
+ vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
-// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x94,0xfe,0x85,0xff,0xff,0xff]
- vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
+ vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
-// CHECK: vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm26
+// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm26
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x54,0x39,0x20]
- vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm26
+ vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm26
-// CHECK: vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26
+// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa2,0x94,0xb9,0x00,0x04,0x00,0x00]
- vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26
+ vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26
-// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
+// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x84,0xfe,0x7b,0x00,0x00,0x00]
- vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
+ vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
-// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
+// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x84,0xfe,0x7b,0x00,0x00,0x00]
- vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
+ vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
-// CHECK: vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24
+// CHECK: vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x44,0x39,0x40]
- vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24
+ vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm24
-// CHECK: vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
+// CHECK: vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa2,0x84,0xb9,0x00,0x04,0x00,0x00]
- vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
+ vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
-// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0xbc,0xfe,0x7b,0x00,0x00,0x00]
- vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+ vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
-// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0xbc,0xfe,0x7b,0x00,0x00,0x00]
- vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+ vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
-// CHECK: vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23
+// CHECK: vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0x7c,0x39,0x40]
- vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23
+ vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm23
-// CHECK: vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
+// CHECK: vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0xa2,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
+ vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
-// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+ vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
-// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0xa4,0xfe,0x85,0xff,0xff,0xff]
- vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+ vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
-// CHECK: vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x64,0x39,0x40]
- vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+ vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm28
-// CHECK: vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa2,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+ vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
-// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
+// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
+ vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
-// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
+// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x8c,0xfe,0x85,0xff,0xff,0xff]
- vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
+ vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
-// CHECK: vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm25
+// CHECK: vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x4c,0x39,0x40]
- vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm25
+ vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm25
-// CHECK: vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25
+// CHECK: vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa2,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25
+ vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25
-// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+ vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
-// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0xac,0xfe,0x7b,0x00,0x00,0x00]
- vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+ vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
-// CHECK: vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+// CHECK: vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x6c,0x39,0x20]
- vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+ vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm21
-// CHECK: vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+// CHECK: vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa3,0xac,0xb9,0x00,0x04,0x00,0x00]
- vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+ vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
-// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0xbc,0xfe,0x7b,0x00,0x00,0x00]
- vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+ vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
-// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0xbc,0xfe,0x7b,0x00,0x00,0x00]
- vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+ vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
-// CHECK: vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23
+// CHECK: vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0x7c,0x39,0x20]
- vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23
+ vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm23
-// CHECK: vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
+// CHECK: vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa3,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
+ vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
-// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff]
- vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+ vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
-// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff]
- vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+ vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
-// CHECK: vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19
+// CHECK: vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x5c,0x39,0x20]
- vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19
+ vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19
-// CHECK: vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
+// CHECK: vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa3,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
+ vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
-// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
+// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0xac,0xfe,0x85,0xff,0xff,0xff]
- vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
+ vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
-// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
+// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0xac,0xfe,0x85,0xff,0xff,0xff]
- vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
+ vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
-// CHECK: vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm29
+// CHECK: vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm29
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0x6c,0x39,0x20]
- vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm29
+ vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm29
-// CHECK: vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29
+// CHECK: vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa3,0xac,0xb9,0x00,0x04,0x00,0x00]
- vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29
+ vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29
-// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+ vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
-// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0xa4,0xfe,0x7b,0x00,0x00,0x00]
- vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+ vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
-// CHECK: vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x64,0x39,0x40]
- vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm28
+ vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm28
-// CHECK: vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa3,0xa4,0xb9,0x00,0x04,0x00,0x00]
- vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+ vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
-// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
+// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
+ vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
-// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
+// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x8c,0xfe,0x7b,0x00,0x00,0x00]
- vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
+ vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
-// CHECK: vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm25
+// CHECK: vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x4c,0x39,0x40]
- vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm25
+ vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm25
-// CHECK: vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25
+// CHECK: vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa3,0x8c,0xb9,0x00,0x04,0x00,0x00]
- vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25
+ vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25
-// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
+// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff]
- vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
+ vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
-// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
+// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff]
- vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
+ vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
-// CHECK: vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm27
+// CHECK: vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm27
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x5c,0x39,0x40]
- vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm27
+ vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm27
-// CHECK: vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27
+// CHECK: vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa3,0x9c,0xb9,0x00,0x04,0x00,0x00]
- vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27
+ vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27
-// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
+// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0xbc,0xfe,0x85,0xff,0xff,0xff]
- vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
+ vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
-// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
+// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0xbc,0xfe,0x85,0xff,0xff,0xff]
- vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
+ vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
-// CHECK: vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm23
+// CHECK: vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0x7c,0x39,0x40]
- vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm23
+ vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm23
-// CHECK: vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23
+// CHECK: vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0xa3,0xbc,0xb9,0x00,0x04,0x00,0x00]
- vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23
+ vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23
 // CHECK: vcvtpd2ps xmm0, xmm23
 // CHECK: encoding: [0x62,0xb1,0xfd,0x08,0x5a,0xc7]
diff --git a/llvm/test/MC/X86/intel-syntax.s b/llvm/test/MC/X86/intel-syntax.s
index 2b365699eec7b..c622832d24bea 100644
--- a/llvm/test/MC/X86/intel-syntax.s
+++ b/llvm/test/MC/X86/intel-syntax.s
@@ -144,7 +144,7 @@ main:
 // CHECK: vshufpd $1, %xmm2, %xmm1, %xmm0
 vshufpd XMM0, XMM1, XMM2, 1
 // CHECK: vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm1
- vpgatherdd XMM10, XMMWORD PTR [R15 + 2*XMM9], XMM8
+ vpgatherdd XMM10, DWORD PTR [R15 + 2*XMM9], XMM8
 // CHECK: movsd -8, %xmm5
 movsd XMM5, QWORD PTR [-8]
 // CHECK: movsl (%rsi), %es:(%rdi)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 52f0adf02a396..2d7bc49b6dcae 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -34,6 +34,23 @@ define i32 @foo1(ptr %a) #0 {
   ret i32 %t0
 }
 
+define i32 @align_assume_trunc_cond(ptr %a) #0 {
+; CHECK-LABEL: @align_assume_trunc_cond(
+; CHECK-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[PTRINT]] to i1
+; CHECK-NEXT: [[MASKCOND:%.*]] = xor i1 [[TRUNC]], true
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: ret i32 [[T0]]
+;
+  %t0 = load i32, ptr %a, align 4
+  %ptrint = ptrtoint ptr %a to i64
+  %trunc = trunc i64 %ptrint to i1
+  %maskcond = xor i1 %trunc, true
+  tail call void @llvm.assume(i1 %maskcond)
+  ret i32 %t0
+}
+
 ; Same check as in @foo1, but make sure it works if the assume is first too.
 define i32 @foo2(ptr %a) #0 {
diff --git a/llvm/test/Transforms/InstCombine/cttz.ll b/llvm/test/Transforms/InstCombine/cttz.ll
index cb0bc59ae7995..829213b24e93e 100644
--- a/llvm/test/Transforms/InstCombine/cttz.ll
+++ b/llvm/test/Transforms/InstCombine/cttz.ll
@@ -297,3 +297,96 @@ define i16 @cttz_assume(i16 %x) {
   %cttz = call i16 @llvm.cttz.i16(i16 %x, i1 false)
   ret i16 %cttz
 }
+
+
+declare void @use.i8(i8)
+define i8 @fold_ctz_log2(i8 %x) {
+; CHECK-LABEL: @fold_ctz_log2(
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 5)
+; CHECK-NEXT: ret i8 [[R]]
+;
+  %p2 = shl i8 1, %x
+  %v = call i8 @llvm.umin(i8 %p2, i8 32)
+  %r = call i8 @llvm.cttz(i8 %v, i1 false)
+  ret i8 %r
+}
+
+define i9 @fold_ctz_log2_i9_okay(i9 %x) {
+; CHECK-LABEL: @fold_ctz_log2_i9_okay(
+; CHECK-NEXT: [[R:%.*]] = call i9 @llvm.umin.i9(i9 [[X:%.*]], i9 5)
+; CHECK-NEXT: ret i9 [[R]]
+;
+  %p2 = shl i9 1, %x
+  %v = call i9 @llvm.umin(i9 %p2, i9 32)
+  %r = call i9 @llvm.cttz(i9 %v, i1 false)
+  ret i9 %r
+}
+
+define i8 @fold_ctz_log2_maybe_z(i8 %x, i8 %y, i1 %c) {
+; CHECK-LABEL: @fold_ctz_log2_maybe_z(
+; CHECK-NEXT: [[V:%.*]] = shl i8 2, [[V_V:%.*]]
+; CHECK-NEXT: [[P2_2:%.*]] = shl i8 4, [[Y:%.*]]
+; CHECK-NEXT: [[V1:%.*]] = select i1 [[C:%.*]], i8 [[V]], i8 [[P2_2]]
+; CHECK-NEXT: [[R:%.*]] = call range(i8 1, 9) i8 @llvm.cttz.i8(i8 [[V1]], i1 false)
+; CHECK-NEXT: ret i8 [[R]]
+;
+  %p2 = shl i8 2, %x
+  %p2_2 = shl i8 4, %y
+  %v = select i1 %c, i8 %p2, i8 %p2_2
+  %r = call i8 @llvm.cttz(i8 %v, i1 false)
+  ret i8 %r
+}
+
+define i8 @fold_ctz_log2_maybe_z_okay(i8 %x, i8 %y, i1 %c) {
+; CHECK-LABEL: @fold_ctz_log2_maybe_z_okay(
+; CHECK-NEXT: [[X:%.*]] = add i8 [[X1:%.*]], 1
+; CHECK-NEXT: [[Y:%.*]] = add i8 [[Y1:%.*]], 2
+; CHECK-NEXT: [[V_V:%.*]] = select i1 [[C:%.*]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT: ret i8 [[V_V]]
+;
+  %p2 = shl i8 2, %x
+  %p2_2 = shl i8 4, %y
+  %v = select i1 %c, i8 %p2, i8 %p2_2
+  %r = call i8 @llvm.cttz(i8 %v, i1 true)
+  ret i8 %r
+}
+
+define i8 @fold_clz_log2(i8 %x) {
+; CHECK-LABEL: @fold_clz_log2(
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 5)
+; CHECK-NEXT: [[R:%.*]] = xor i8 [[TMP1]], 7
+; CHECK-NEXT: ret i8 [[R]]
+;
+  %p2 = shl i8 1, %x
+  %v = call i8 @llvm.umin(i8 %p2, i8 32)
+  %r = call i8 @llvm.ctlz(i8 %v, i1 false)
+  ret i8 %r
+}
+
+define i8 @fold_clz_log2_multiuse_fail(i8 %x) {
+; CHECK-LABEL: @fold_clz_log2_multiuse_fail(
+; CHECK-NEXT: [[P2:%.*]] = shl nuw i8 2, [[X:%.*]]
+; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.umin.i8(i8 [[P2]], i8 32)
+; CHECK-NEXT: call void @use.i8(i8 [[V]])
+; CHECK-NEXT: [[R:%.*]] = call range(i8 2, 9) i8 @llvm.ctlz.i8(i8 [[V]], i1 true)
+; CHECK-NEXT: ret i8 [[R]]
+;
+  %p2 = shl nuw i8 2, %x
+  %v = call i8 @llvm.umin(i8 %p2, i8 32)
+  call void @use.i8(i8 %v)
+  %r = call i8 @llvm.ctlz(i8 %v, i1 true)
+  ret i8 %r
+}
+
+
+define i9 @fold_clz_log2_i9(i9 %x) {
+; CHECK-LABEL: @fold_clz_log2_i9(
+; CHECK-NEXT: [[TMP1:%.*]] = call i9 @llvm.umin.i9(i9 [[X:%.*]], i9 5)
+; CHECK-NEXT: [[R:%.*]] = sub nuw nsw i9 8, [[TMP1]]
+; CHECK-NEXT: ret i9 [[R]]
+;
+  %p2 = shl i9 1, %x
+  %v = call i9 @llvm.umin(i9 %p2, i9 32)
+  %r = call i9 @llvm.ctlz(i9 %v, i1 true)
+  ret i9 %r
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 1cfb507a74344..c3e8c895fce24 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -11,10 +11,10 @@ define i64 @test(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 26
+; CHECK: Cost for VF 8: 30
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 48
+; CHECK: Cost for VF 16: 56
 ; CHECK: LV: Selecting VF: 16
 entry:
   br label %for.body
@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body
   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
   %1 = load i8, ptr %arrayidx2, align 1
   %conv3 = zext i8 %1 to i64
-  %mul = mul nuw nsw i64 %conv3, %conv
-  %add = add i64 %mul, %sum
+  %div = udiv i64 %conv3, %conv
+  %add = add i64 %div, %sum
   %i.iv.next = add nuw nsw i64 %i.iv, 1
   %exitcond.not = icmp eq i64 %i.iv.next, 16
   br i1 %exitcond.not, label %exit, label %for.body
@@ -45,11 +45,11 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 26
+; CHECK: Cost for VF 8: 30
 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 49
+; CHECK: Cost for VF 16: 57
 ; CHECK: LV: Selecting VF: vscale x 2
 entry:
   br label %for.body
@@ -64,8 +64,8 @@ for.body: ; preds = %entry, %for.body
   %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
   %1 = load i8, ptr %arrayidx2, align 1
   %conv3 = zext i8 %1 to i64
-  %mul = mul nuw nsw i64 %conv3, %conv
-  %add = add i64 %sum, %mul
+  %div = udiv i64 %conv3, %conv
+  %add = add i64 %sum, %div
   %exitcond.not = icmp eq i64 %i.iv.next, 16
   br i1 %exitcond.not, label %exit, label %for.body
@@ -82,11 +82,11 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 27
+; CHECK: Cost for VF 8: 24
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 48
+; CHECK: Cost for VF 16: 42
 ; CHECK: LV: Selecting VF: 16
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
new file mode 100644
index 0000000000000..5cc00daab7ce5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mattr=+dotprod -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
+; CHECK-NEXT: [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]]
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
+; CHECK-LABEL: define void @dotp_small_epilogue_vf(
+; CHECK-SAME: i64 [[IDX_NEG:%.*]], i8 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1, [[IDX_NEG]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i64>
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i64> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[ADD:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT7]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT8]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i64> zeroinitializer, i64 [[ACCUM]], i32 0
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <1 x i64> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[TMP9]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT12]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = mul <8 x i64> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[PARTIAL_REDUCE13]] = call <1 x i64> @llvm.experimental.vector.partial.reduce.add.v1i64.v8i64(<1 x i64> [[VEC_PHI10]], <8 x i64> [[TMP11]])
+; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> [[PARTIAL_REDUCE13]])
+; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IDX_NEG]], [[ITER_CHECK:%.*]] ], [ [[IND_END6]], [[WHILE_BODY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IV_NEXT]], [[WHILE_BODY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[ADD]], [[WHILE_BODY]] ]
+; CHECK-NEXT: br label [[WHILE_BODY1:%.*]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IV_NEG:%.*]] = phi i64 [ [[IV_NEG_NEXT:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[ACCUM1:%.*]] = phi i64 [ [[ADD1:%.*]], [[WHILE_BODY1]] ], [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_NEG_NEXT]] = add i64 [[IV_NEG]], 1
+; CHECK-NEXT: [[EXT_A:%.*]] = sext i8 [[A]] to i64
+; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
+; CHECK-NEXT: [[B:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[EXT_B:%.*]] = sext i8 [[B]] to i64
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[EXT_B]], [[EXT_A]]
+; CHECK-NEXT: [[ADD1]] = add i64 [[MUL]], [[ACCUM1]]
+; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0
+; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[IV1]], -1
+; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: while.end.loopexit:
+; CHECK-NEXT: [[RESULT:%.*]] = phi i64 [ [[ADD1]], [[WHILE_BODY1]] ], [ [[ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %iv.neg = phi i64 [ %iv.neg.next, %while.body ], [ %idx.neg, %entry ]
+  %iv = phi i64 [ %iv.next, %while.body ], [ 0, %entry ]
+  %accum = phi i64 [ %add, %while.body ], [ 0, %entry ]
+  %iv.neg.next = add i64 %iv.neg, 1
+  %ext.a = sext i8 %a to i64
+  %iv.next = add i64 %iv, 1
+  %b = load i8, ptr null, align 1
+  %ext.b = sext i8 %b to i64
+  %mul = mul i64 %ext.b, %ext.a
+  %add = add i64 %mul, %accum
+  %cmp.iv.neg = icmp ugt i64 %iv.neg, 0
+  %cmp.iv = icmp ne i64 %iv, -1
+  %exitcond = and i1 %cmp.iv.neg, %cmp.iv
+  br i1 %exitcond, label %while.body, label %while.end.loopexit

while.end.loopexit: ; preds = %while.body
  %result = phi i64 [ %add, %while.body ]
  ret void
}

(continued as added lines in the same new-file hunk)
+
+while.end.loopexit: ; preds = %while.body
+  %result = phi i64 [ %add, %while.body ]
+  ret void
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
+attributes #1 = { "target-cpu"="apple-m1" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
new file mode 100644
index 0000000000000..74db8683d5df8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -0,0 +1,206 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+i8mm,+dotprod -S < %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+dotprod -S < %s | FileCheck %s --check-prefix=CHECK-NOI8MM
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp_z_s(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp_z_s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-NOI8MM-LABEL: define i32 @dotp_z_s(
+; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NOI8MM-NEXT: entry:
+; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NOI8MM: vector.ph:
+; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NOI8MM: vector.body:
+; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NOI8MM: middle.block:
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @dotp_s_z(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp_s_z(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [
[[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-NOI8MM-LABEL: define i32 @dotp_s_z( +; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NOI8MM-NEXT: entry: +; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM: vector.ph: +; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NOI8MM: vector.body: +; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = 
getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM: middle.block: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll new file mode 100644 index 0000000000000..c66695f1b50f0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -0,0 +1,1375 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define 
i32 @dotp(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; 
CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: br i1 
true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_different_types(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: 
[[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] 
+; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = 
load <16 x i8>, ptr [[TMP34]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], 
align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; 
CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add 
i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 +; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 +; CHECK-MAXBW-NEXT: [[TMP57:%.*]] 
= insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 +; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 +; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 +; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 +; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 +; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 +; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i16, ptr %gep.b, align 2 + %ext.b = zext i16 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT:
[[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16
x i32> +; CHECK-MAXBW-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_phi(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [
[[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { +;
CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr 
[[TMP19]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi 
i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], 
i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( +; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; 
CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] + %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] + %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] + %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] + %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv + %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv + %offset.1 = or disjoint i64 %iv, 1 + %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 + %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 + %offset.2 = or disjoint i64 %iv, 2 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 + %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 + %offset.3 = or disjoint i64 %iv, 3 + %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 + %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 + %load.a0 = load i8, ptr %gep.a0, align 1 + %ext.a0 = sext i8 %load.a0 to i32 + %load.b0 = load i8, ptr %gep.b0, align 1 + %ext.b0 = sext i8 %load.b0 to i32 + %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 + %add.a0 = add nsw i32 %mul.a0, %accum0 + %load.a1 = load i8, ptr %gep.a1, align 1 + %ext.a1 = sext i8 %load.a1 to i32 + %load.b1 = load i8, ptr %gep.b1, align 1 + %ext.b1 = sext i8 %load.b1 to i32 + %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 + %add.a1 = add nsw i32 %mul.a1, %accum1 + %load.a2 = load i8, ptr %gep.a2, align 1 + %ext.a2 = sext i8 %load.a2 to i32 + %load.b2 = load i8, ptr %gep.b2, align 1 + %ext.b2 = sext i8 %load.b2 to i32 + %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 + %add.a2 = add nsw i32 %mul.a2, %accum2 + %load.a3 = load i8, ptr %gep.a3, align 1 + %ext.a3 = sext i8 %load.a3 to i32 + %load.b3 = load i8, ptr %gep.b3, align 1 + %ext.b3 = sext i8 %load.b3 to i32 + %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 + %add.a3 = add nsw i32 %mul.a3, %accum3 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %num_in + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %result0 = add nsw i32 %add.a0, %add.a1 + %result1 = add nsw i32 %add.a2, %add.a3 + %result = add nsw i32 %result0, %result1 + ret i32 %result +} + +define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 
[[N]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr 
[[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x 
i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64
[[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]])
#[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv + %load.b = load i8, ptr %gep.a2, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7 + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_extend_user(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr 
[[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = 
load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], 
[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %result = add i32 %add, %ext.b + ret i32 %result +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll new file mode 100644 index 0000000000000..af2a7b966f700 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -0,0 +1,2164 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call 
i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label
[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP23:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP24:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +;
CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr
i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8,
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP23]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP24]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP25]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP26]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP27]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP28]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = load i16, ptr [[TMP29]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = load i16, ptr [[TMP30]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP41]], i32 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP55]], i32 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP56]], i32 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
+; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
+; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
+; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i16, ptr %gep.b, align 2
+  %ext.b = zext i16 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP21]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %ext.b
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP21]], [[TMP36]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP38]], [[VEC_PHI3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[TMP25]], [[TMP42]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add <vscale x 4 x i32> [[TMP28]], [[VEC_PHI2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD7]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <vscale x 4 x i32> [[TMP31]], [[TMP33]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add <vscale x 4 x i32> [[TMP34]], [[VEC_PHI1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw <vscale x 4 x i32> [[TMP37]], [[TMP39]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add <vscale x 4 x i32> [[TMP40]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP34]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP56]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <vscale x 4 x i32> [[TMP28]], [[TMP66]]
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP82]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <vscale x 4 x i32> [[TMP38]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP54]], [[TMP60]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]]
+; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw <vscale x 4 x i32> [[TMP70]], [[TMP76]]
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw <vscale x 4 x i32> [[TMP71]], [[TMP77]]
+; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add <vscale x 4 x i32> [[TMP78]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add <vscale x 4 x i32> [[TMP79]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]]
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled(
+; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+;
CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP24]], align 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw [[TMP29]], [[TMP23]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI7]], [[TMP31]]) +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP32]], align 1 +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD12]] to +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP38]], align 1 +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext [[WIDE_LOAD14]] to +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw [[TMP37]], [[TMP43]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI6]], [[TMP45]]) +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP46]], align 1 +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext [[WIDE_LOAD18]] to +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP52]], align 1 +; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext [[WIDE_LOAD20]] to +; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw [[TMP51]], [[TMP57]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI5]], [[TMP59]]) +; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load , ptr [[TMP60]], align 1 +; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext [[WIDE_LOAD24]] to +; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load , ptr [[TMP66]], align 1 +; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD26]] to +; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw [[TMP65]], [[TMP71]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] + %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] + %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] + %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] + %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv + %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv + %offset.1 = or disjoint i64 %iv, 1 + %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 + %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 + %offset.2 = or disjoint i64 %iv, 2 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 + %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 + %offset.3 = or disjoint i64 %iv, 3 + %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 + %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 + %load.a0 = load i8, ptr %gep.a0, align 1 + %ext.a0 = sext i8 %load.a0 to i32 + %load.b0 = load i8, ptr %gep.b0, align 1 + %ext.b0 = sext i8 %load.b0 to i32 + %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 + %add.a0 = add nsw i32 %mul.a0, %accum0 + %load.a1 = load i8, ptr %gep.a1, align 1 + %ext.a1 = sext i8 %load.a1 to i32 + %load.b1 = load i8, ptr %gep.b1, align 1 + %ext.b1 = sext i8 %load.b1 to i32 + %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 + %add.a1 = add nsw i32 %mul.a1, %accum1 + %load.a2 = load i8, ptr %gep.a2, align 1 + %ext.a2 = sext i8 %load.a2 to i32 + %load.b2 = load i8, ptr %gep.b2, align 1 + %ext.b2 = sext i8 %load.b2 to i32 + %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 + %add.a2 = add nsw i32 %mul.a2, %accum2 + %load.a3 = load i8, ptr %gep.a3, align 1 + %ext.a3 = sext i8 %load.a3 to i32 + %load.b3 = load i8, ptr %gep.b3, align 1 + %ext.b3 = sext i8 %load.b3 to i32 + %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 + %add.a3 = add nsw i32 %mul.a3, %accum3 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %num_in + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %result0 = add nsw i32 %add.a0, %add.a1 + %result1 = add nsw i32 %add.a2, %add.a3 + %result = add nsw i32 %result0, %result1 + ret i32 %result +} + +define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP10]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: 
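+
+; Note on the CHECK-MAXBW expectations above: per the LangRef, the intrinsic
+; @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32 folds all lanes
+; of the wide (nxv8i32) operand into the narrow (nxv2i32) accumulator. Which
+; input lane lands in which result lane is unspecified; only the total sum is
+; guaranteed, which is all the final vector.reduce.add needs. A rough sketch
+; with illustrative value names (not part of the test):
+;
+;   %r = call <vscale x 2 x i32>
+;       @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(
+;           <vscale x 2 x i32> %acc, <vscale x 8 x i32> %wide)
+;   ; sum of all lanes of %r == sum(%acc) + sum(%wide)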
+define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD2]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = sext i8 %load.a to i32
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul = mul nsw i32 %ext.b, %ext.a
+  %add = add nsw i32 %mul, %accum
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+  ret i32 %add
+}
+define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = sext i8 %load.a to i32
+  %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.b = load i8, ptr %gep.a2, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul = mul nsw i32 %ext.b, %ext.a
+  %add = add nsw i32 %mul, %accum
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7
+
+exit: ; preds = %for.body
+  ret i32 %add
+}
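+
+; The !llvm.loop metadata on @not_dotp_predicated_pragma's latch presumably
+; requests tail-folded (predicated) vectorization: all three check prefixes
+; above expect @llvm.get.active.lane.mask plus masked loads and a select on
+; the accumulator, rather than a partial reduction.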
+define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP14]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  %result = add i32 %add, %ext.b
+  ret i32 %result
+}
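+
+; In @not_dotp_extend_user above, %ext.b is also used outside the loop
+; (%result = add i32 %add, %ext.b in %for.exit), so the zero-extends have a
+; user besides the multiply; presumably that is why the checks expect plain
+; zext/mul/add chains instead of a dot-product style partial reduction.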
+define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw [[TMP20]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw [[TMP21]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[VEC_PHI]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI1]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64( [[VEC_PHI]], [[TMP14]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+  %sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i64
+  %i.iv.next = add nuw nsw i64 %i.iv, 1
+  %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i64
+  %mul = mul nuw nsw i64 %conv3, %conv
+  %add = add i64 %sum, %mul
+  %exitcond.not = icmp eq i64 %i.iv.next, 16
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+  ret i64 %add
+}
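+
+; @dotp_cost_disagreement above is an i8 -> i64 dot product reading %b at
+; %i.iv + 1. Note the CHECK-MAXBW run still forms a partial reduction, here
+; with an 8:1 lane ratio per the intrinsic mangling
+; (partial.reduce.add.nxv1i64.nxv8i64).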
+define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_phi2(
+; CHECK-INTERLEAVE1-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.preheader:
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1
+; CHECK-INTERLEAVE1-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ]
+; CHECK-INTERLEAVE1-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]]
+; CHECK-INTERLEAVE1-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi2(
+; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-INTERLEAVED: for.preheader:
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1
+; CHECK-INTERLEAVED-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-INTERLEAVED-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 16
+; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 1
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 2
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 2
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext i8 [[TMP8]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sext i8 [[TMP9]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw i32 [[A_EXT]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nsw i32 [[A_EXT]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[VEC_PHI2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext i8 [[TMP16]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext i8 [[TMP17]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add i32 [[TMP20]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define void @not_dotp_not_phi2(
+; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-MAXBW: for.preheader:
+; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1
+; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1
+; CHECK-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-MAXBW-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ]
+; CHECK-MAXBW-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ]
+; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1
+; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
+; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-MAXBW-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
+; CHECK-MAXBW-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32
+; CHECK-MAXBW-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]]
+; CHECK-MAXBW-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]]
+; CHECK-MAXBW-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %for.preheader, label %exit
+
+for.preheader: ; preds = %entry
+  %load.a = load i8, ptr inttoptr (i64 0 to ptr), align 1
+  %load.a1 = load i8, ptr inttoptr (i64 1 to ptr), align 1
+  %a.ext = sext i8 %load.a to i32
+  %a.ext1 = sext i8 %load.a1 to i32
+  br label %for.body
+
+for.body: ; preds = %for.preheader, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %for.preheader ]
+  %ptr = phi ptr [ %scevgep, %for.body ], [ %matrix, %for.preheader ]
+  %accum = phi i32 [ %add.1, %for.body ], [ 0, %for.preheader ]
+  %gep.b = getelementptr i8, ptr %ptr, i64 1
+  %gep.b1 = getelementptr i8, ptr %ptr, i64 2
+  %load.b = load i8, ptr %gep.b, align 1
+  %b.ext = sext i8 %load.b to i32
+  %mul = mul nsw i32 %a.ext, %b.ext
+  %add = add i32 %mul, %accum
+  %load.b1 = load i8, ptr %gep.b1, align 1
+  %b.ext1 = sext i8 %load.b1 to i32
+  %mul.1 = mul nsw i32 %a.ext1, %b.ext1
+  %add.1 = add i32 %mul.1, %add
+  %scevgep = getelementptr i8, ptr %ptr, i64 16
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond.not = icmp eq i32 %iv.next, %n
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  %add.1.lcssa = phi i32 [ %add.1, %for.body ]
+  %add.float = sitofp i32 %add.1.lcssa to float
+  br label %exit
+
+exit: ; preds = %for.exit, %entry
+  %result = phi float [ 0.000000e+00, %entry ], [ %add.float, %for.exit ]
+  store float %result, ptr %matrix, align 4
+  ret void
+}
CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <8 x i64> [[TMP6]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-MAXBW: for.ph: +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 
@llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]]
+; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ %cmp = icmp eq i64 %n, 0
+ br i1 %cmp, label %exit, label %for.ph
+
+for.ph: ; preds = %entry
+ %ext.b = zext i16 %b to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ]
+ %accum = phi i64 [ 0, %for.ph ], [ %add, %for.body ]
+ %gep.a = getelementptr inbounds nuw i16, ptr %a, i64 %iv
+ %load.a = load i16, ptr %gep.a, align 2
+ %ext.a = zext i16 %load.a to i64
+ %mul = mul nuw nsw i64 %ext.a, %ext.b
+ %add = add i64 %mul, %accum
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cmp.1 = icmp eq i64 %iv.next, %n
+ br i1 %cmp.1, label %exit, label %for.body
+
+exit: ; preds = %for.cond.cleanup.loopexit, %entry
+ %result = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ ret i64 %result
+}
+
+define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan2(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
+; CHECK-INTERLEAVE1: for.ph:
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: br label
[[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan2( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVED: for.ph: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> 
[[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <8 x i64> [[TMP6]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan2(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
+; CHECK-MAXBW: for.ph:
+; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ %cmp = icmp eq i64 %n, 0
+ br i1 %cmp, label %exit, label %for.ph
+
+for.ph: ; preds = %entry
+ %ext.b = zext i16 %b to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ]
+ %accum = phi i64 [ 0, %for.ph ], [ %add, %for.body ]
+ %gep.a = getelementptr inbounds nuw i16, ptr %a, i64 %iv
+ %load.a = load i16, ptr %gep.a, align 2
+ %ext.a = zext i16 %load.a to i64
+ %mul = mul nuw nsw i64 %ext.b, %ext.a
+ %add = add i64 %mul, %accum
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cmp.1 = icmp eq i64 %iv.next, %n
+ br i1 %cmp.1, label %exit, label %for.body
+
+exit: ; preds = %for.cond.cleanup.loopexit, %entry
+ %result = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ ret i64 %result
+}
+
+!7 = distinct !{!7, !8, !9, !10}
+!8 = !{!"llvm.loop.mustprogress"}
+!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.enable", i1 true}
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll
new file mode 100644
index 0000000000000..f24b115ab9f99
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @not_dotp(ptr %a, ptr %b) {
+; CHECK-LABEL: define i32 @not_dotp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
new file mode 100644
index 0000000000000..5dd9f8ff97cca
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -0,0 +1,94 @@
+; REQUIRES: asserts
+; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Tests for printing VPlans that are enabled under AArch64
+
+define i32 @print_partial_reduction(ptr %a, ptr %b) {
+; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<0> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
+; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
+; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
+; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
+; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
+; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<%mul>, ir<[[ACC]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT vp<%bc.resume.val> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0>
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT: IR %add = add i32 %mul, %accum
+; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
+; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret i32 %add
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
new file mode 100644
index 0000000000000..9f8cf169c0593
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -0,0 +1,549 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
+
+define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
+; IF-EVL-LABEL: define void @first_order_recurrence(
+; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-NEXT: [[ENTRY:.*]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]]
[[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4 +; IF-EVL-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP23]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP24:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP24]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP24]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @first_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP11]], align 4 +; NO-VP-NEXT: [[TMP12:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP13:%.*]] = add nsw [[TMP12]], [[WIDE_LOAD]] +; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 +; NO-VP-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 4 +; NO-VP-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP19]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label 
%[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP20:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP20]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP20]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP: [[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for1, %0 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { +; IF-EVL-LABEL: define void @second_order_recurrence( +; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP19]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[TMP20]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], 4 +; IF-EVL-NEXT: [[TMP27:%.*]] = sub i32 [[TMP26]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP27]] +; IF-EVL-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4 +; IF-EVL-NEXT: [[TMP30:%.*]] = sub i32 [[TMP29]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement [[TMP19]], i32 [[TMP30]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = 
getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @second_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; NO-VP-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP11]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP12]] +; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP13]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP14]], align 4 +; NO-VP-NEXT: [[TMP15]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP15]], i32 -1) +; NO-VP-NEXT: [[TMP17:%.*]] = add nsw [[TMP15]], [[TMP16]] +; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP12]] +; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 0 +; NO-VP-NEXT: store [[TMP17]], ptr [[TMP19]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4 +; NO-VP-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP23]] +; NO-VP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP25:%.*]] = mul i32 
[[TMP24]], 4 +; NO-VP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement [[TMP15]], i32 [[TMP26]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP27:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP27]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP: [[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %for2 = phi i32 [ 22, %entry ], [ %for1, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for1, %for2 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { +; IF-EVL-LABEL: define void @third_order_recurrence( +; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; 
IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 +; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP17]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]] +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP22]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[TMP23]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP22]], i32 -1) +; IF-EVL-NEXT: [[TMP24:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP23]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP23]], [[TMP24]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP]], [[TMP22]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]] +; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP5]], ptr align 4 [[TMP26]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4 +; IF-EVL-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP31]] +; IF-EVL-NEXT: [[TMP32:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP33:%.*]] = mul i32 [[TMP32]], 4 +; IF-EVL-NEXT: [[TMP34:%.*]] = sub i32 [[TMP33]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = 
extractelement [[TMP22]], i32 [[TMP34]] +; IF-EVL-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 4 +; IF-EVL-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement [[TMP23]], i32 [[TMP37]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] +; IF-EVL-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[FOR1]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @third_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; NO-VP-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP11]] +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; NO-VP-NEXT: [[TMP14:%.*]] 
= sub i32 [[TMP13]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP14]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP15]] +; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP16]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP17]], align 4 +; NO-VP-NEXT: [[TMP18]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP18]], i32 -1) +; NO-VP-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP19]], i32 -1) +; NO-VP-NEXT: [[TMP21:%.*]] = add nsw [[TMP19]], [[TMP20]] +; NO-VP-NEXT: [[TMP22:%.*]] = add [[TMP21]], [[TMP18]] +; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP15]] +; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0 +; NO-VP-NEXT: store [[TMP22]], ptr [[TMP24]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4 +; NO-VP-NEXT: [[TMP28:%.*]] = sub i32 [[TMP27]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP28]] +; NO-VP-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4 +; NO-VP-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement [[TMP18]], i32 [[TMP31]] +; NO-VP-NEXT: [[TMP32:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP33:%.*]] = mul i32 [[TMP32]], 4 +; NO-VP-NEXT: [[TMP34:%.*]] = sub i32 [[TMP33]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement [[TMP19]], i32 [[TMP34]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP35:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: 
[[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP35]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] +; NO-VP-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[FOR1]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NO-VP: [[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %for2 = phi i32 [ 22, %entry ], [ %for1, %for.body ] + %for3 = phi i32 [ 11, %entry ], [ %for2, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for2, %for3 + %add1 = add i32 %add, %for1 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add1, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IF-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IF-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. +; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-VP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-VP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; NO-VP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; NO-VP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; NO-VP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; NO-VP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. 
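For reference, the `@third_order_recurrence` kernel checked above keeps the values loaded one, two, and three iterations ago live across the loop, which is what both the IF-EVL and NO-VP lowerings must preserve. The following scalar model is illustrative only (it is not part of the test file):

```python
# Scalar model of @third_order_recurrence (illustrative, not part of the test).
# B[i] combines the values loaded 1, 2, and 3 iterations ago; the seeds 33,
# 22, 11 match the incoming values of the %for1/%for2/%for3 phis in the IR.
def third_order_recurrence(A, TC):
    B = [0] * TC
    for1, for2, for3 = 33, 22, 11
    for i in range(TC):
        cur = A[i]                    # %0 = load A[i]
        B[i] = (for2 + for3) + for1   # %add1 = (%for2 + %for3) + %for1
        # Rotate the recurrence chain for the next iteration.
        for1, for2, for3 = cur, for1, for2
    return B
```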
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 052b4a10e9c8d..06f0f05889116 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -40,7 +40,9 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[BC_MERGE_RDX1]], 3 +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 @@ -144,7 +146,9 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[BC_MERGE_RDX1]], 2 +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index f9e415a3cefc1..27f3155b50dbb 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -259,11 +259,9 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], splat (i8 -1) -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP8]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP10]], <4 x i8> [[TMP3]], i64 12) ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 
x i8> [[TMP11]], <16 x i8> [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP13]] ; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll new file mode 100644 index 0000000000000..07ee8f840721f --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-100 < %s | FileCheck %s + +define i8 @test(ptr %g_127, i32 %0, i16 %1) { +; CHECK-LABEL: define i8 @test( +; CHECK-SAME: ptr [[G_127:%.*]], i32 [[TMP0:%.*]], i16 [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_INC434_I:.*]] +; CHECK: [[FOR_COND166_PREHEADER_I:.*]]: +; CHECK-NEXT: br label %[[FOR_INC434_I]] +; CHECK: [[FOR_INC434_I]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 60, %[[FOR_COND166_PREHEADER_I]] ] +; CHECK-NEXT: [[CONV8_I_I:%.*]] = zext nneg i32 [[TMP0]] to i64 +; CHECK-NEXT: [[DIV_I_I_1:%.*]] = udiv i64 [[CONV8_I_I]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[DIV_I_I_1]] to i16 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> poison, i64 [[CONV8_I_I]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP12]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> [[TMP14]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i16 [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[AND14_I_2_I_5:%.*]] = zext i16 [[OP_RDX]] to i32 +; CHECK-NEXT: store i32 [[AND14_I_2_I_5]], ptr [[G_127]], align 4 +; CHECK-NEXT: ret i8 0 +; +entry: + br label %for.inc434.i + +for.cond166.preheader.i: + br label %for.inc434.i + +for.inc434.i: + %2 = phi i64 [ 0, %entry ], [ 60, %for.cond166.preheader.i ] + %conv8.i.i = zext nneg i32 %0 to i64 + %div.i.i.1 = udiv i64 %conv8.i.i, %2 + %3 = trunc i64 %div.i.i.1 to i16 + %call12.i.2.i.1 = tail call i16 @llvm.bswap.i16(i16 %3) + %and14.i.2.i.118 = and i16 %1, %call12.i.2.i.1 + %div.i.i.2 = udiv i64 %conv8.i.i, %2 + %4 = trunc i64 %div.i.i.2 to i16 + %call12.i.i.2 = tail call i16 @llvm.bswap.i16(i16 %4) + %and14.i.i.219 = and i16 %and14.i.2.i.118, %call12.i.i.2 + %call12.i.2.i.2 = tail call i16 @llvm.bswap.i16(i16 %4) + %and14.i.2.i.220 = and i16 %and14.i.i.219, %call12.i.2.i.2 + %div.i.i.3 = udiv i64 %conv8.i.i, %2 + %5 = trunc i64 %div.i.i.3 to i16 + %call12.i.2.i.3 = tail call i16 @llvm.bswap.i16(i16 %5) + %and14.i.2.i.322 = and i16 %and14.i.2.i.220, 
%call12.i.2.i.3 + %div.i.i.4 = udiv i64 %conv8.i.i, %2 + %6 = trunc i64 %div.i.i.4 to i16 + %call12.i.i.4 = tail call i16 @llvm.bswap.i16(i16 %6) + %and14.i.i.423 = and i16 %and14.i.2.i.322, %call12.i.i.4 + %call12.i.2.i.4 = tail call i16 @llvm.bswap.i16(i16 %6) + %and14.i.2.i.424 = and i16 %and14.i.i.423, %call12.i.2.i.4 + %div.i.i.5 = udiv i64 %conv8.i.i, %2 + %7 = trunc i64 %div.i.i.5 to i16 + %call12.i.i.5 = tail call i16 @llvm.bswap.i16(i16 %7) + %and14.i.i.525 = and i16 %and14.i.2.i.424, %call12.i.i.5 + %call12.i.2.i.5 = tail call i16 @llvm.bswap.i16(i16 %7) + %and14.i.2.i.51 = and i16 %and14.i.i.525, %call12.i.2.i.5 + %and14.i.2.i.5 = zext i16 %and14.i.2.i.51 to i32 + store i32 %and14.i.2.i.5, ptr %g_127, align 4 + ret i8 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll new file mode 100644 index 0000000000000..71390b643f43d --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test() { +; CHECK-LABEL: define i32 @test() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr null, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[TMP12]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP4]], <4 x i64> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[INC_3_3_I_1:%.*]] = or i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP15]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: ret i32 [[OP_RDX]] +; +entry: + %.pre.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 24), align 8 + %.pre50.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 16), align 16 + %.pre51.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 8), align 8 + %.pre52.i = load i64, ptr null, align 16 + %0 = or i64 %.pre51.i, 0 + %1 = trunc i64 %.pre.i to i32 + %2 = add i32 %1, 0 + %3 = trunc i64 %.pre50.i to i32 + %4 = add i32 %3, 0 + %5 = trunc i64 %.pre51.i to i32 + %6 = add i32 %5, 0 + %7 = trunc i64 0 to i32 + %8 = add i32 %5, 0 + %9 = add i32 %7, 0 + %10 = add i32 %1, 0 + %11 = add i32 %3, 0 + %12 = add i32 %5, 0 + %13 = add i32 %7, 0 + %14 = trunc i64 %.pre.i to i32 + %15 = add i32 %14, 0 + %16 = trunc i64 %.pre50.i to 
i32 + %17 = add i32 %16, 0 + %18 = trunc i64 %.pre51.i to i32 + %19 = add i32 %18, 0 + %20 = trunc i64 %.pre52.i to i32 + %conv14.1.i = or i32 %9, %13 + %21 = or i32 %conv14.1.i, %6 + %22 = or i32 %21, %8 + %23 = or i32 %22, %12 + %24 = or i32 %23, %4 + %25 = or i32 %24, %11 + %26 = or i32 %25, %2 + %27 = or i32 %26, %10 + %28 = or i32 %27, %15 + %29 = or i32 %28, %17 + %30 = or i32 %29, %19 + %31 = add i32 %14, 0 + %32 = add i32 %16, 0 + %33 = add i32 %18, 0 + %34 = add i32 %20, 0 + %35 = add i32 %14, 0 + %36 = add i32 %16, 0 + %37 = add i32 %18, 0 + %38 = add i32 %20, 0 + %39 = add i32 %14, 0 + %40 = add i32 %16, 0 + %41 = add i32 %18, 0 + %42 = add i32 %20, 0 + %inc.3.3.i.1 = or i64 %.pre52.i, 0 + %conv14.i.1 = or i32 %38, %34 + %conv14.1.i.1 = or i32 %conv14.i.1, %42 + %conv14.3.i.1 = or i32 %conv14.1.i.1, %33 + %conv14.145.i.1 = or i32 %conv14.3.i.1, %37 + %conv14.1.1.i.1 = or i32 %conv14.145.i.1, %41 + %conv14.3.1.i.1 = or i32 %conv14.1.1.i.1, %32 + %conv14.247.i.1 = or i32 %conv14.3.1.i.1, %36 + %conv14.1.2.i.1 = or i32 %conv14.247.i.1, %40 + %conv14.3.2.i.1 = or i32 %conv14.1.2.i.1, %31 + %conv14.349.i.1 = or i32 %conv14.3.2.i.1, %35 + %conv14.1.3.i.1 = or i32 %conv14.349.i.1, %39 + %conv14.3.3.i.1 = or i32 %conv14.1.3.i.1, %30 + ret i32 %conv14.3.3.i.1 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll new file mode 100644 index 0000000000000..7576eb7a8f55e --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-9999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test(i64 %l.549) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i64 [[L_549:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CONV3:%.*]] = sext i32 0 to i64 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3 +; CHECK-NEXT: br label %[[IF_THEN19:.*]] +; CHECK: [[P:.*]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: br i1 false, label %[[S:.*]], label %[[Q:.*]] +; CHECK: [[Q]]: +; CHECK-NEXT: [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP3]], i64 0) +; CHECK-NEXT: br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]] +; CHECK: [[LOR_LHS_FALSE]]: +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]] +; CHECK: [[R]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP5]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: br i1 false, label %[[S]], label %[[LAND_LHS_TRUE]] +; CHECK: [[LAND_LHS_TRUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ [[TMP8]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ] +; CHECK-NEXT: br i1 false, label 
%[[Q]], label %[[S]] +; CHECK: [[S]]: +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP6]], %[[LOR_LHS_FALSE]] ], [ [[TMP2]], %[[P]] ] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: br label %[[IF_THEN19]] +; CHECK: [[IF_THEN19]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11]], %[[S]] ] +; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[L_549]], i32 1 +; CHECK-NEXT: [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> zeroinitializer, i64 2) +; CHECK-NEXT: br i1 false, label %[[R]], label %[[IF_END25]] +; CHECK: [[IF_END25]]: +; CHECK-NEXT: br i1 false, label %[[IF_END29]], label %[[P]] +; CHECK: [[IF_END29]]: +; CHECK-NEXT: br label %[[P]] +; +entry: + %conv3 = sext i32 0 to i64 + br label %if.then19 + +p: + %l.0 = phi i64 [ %xor, %if.end29 ], [ %l.5493, %if.end25 ] + %m.0 = phi i64 [ %not21, %if.end29 ], [ %m.550, %if.end25 ] + br i1 false, label %s, label %q + +q: + %xor39 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + %l.1 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + %m.1 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + br i1 false, label %lor.lhs.false, label %r + +lor.lhs.false: + br i1 false, label %land.lhs.true, label %s + +r: + %xor38 = phi i64 [ %xor39, %q ], [ %xor, %if.then19 ] + %j.0 = phi i64 [ %conv3, %q ], [ %not21, %if.then19 ] + %l.2 = phi i64 [ %l.1, %q ], [ %l.549, %if.then19 ] + %m.2 = phi i64 [ %m.1, %q ], [ %m.550, %if.then19 ] + br i1 false, label %s, label %land.lhs.true + +land.lhs.true: + %xor37 = phi i64 [ %xor38, %r ], [ 0, %lor.lhs.false ] + %j.1 = phi i64 [ %j.0, %r ], [ 0, %lor.lhs.false ] + %l.3 = phi i64 [ %l.2, %r ], [ 0, %lor.lhs.false ] + %m.3 = phi i64 [ %m.2, %r ], [ 0, %lor.lhs.false ] + br i1 false, label %q, label %s + +s: + %xor36 = phi i64 [ %xor37, %land.lhs.true ], [ %xor38, %r ], [ %xor39, %lor.lhs.false ], [ %l.0, %p ] + %j.2 = phi i64 [ %j.1, %land.lhs.true ], [ %j.0, %r ], [ %conv3, %lor.lhs.false ], [ %m.0, %p ] + %l.4 = phi i64 [ %l.3, %land.lhs.true ], [ %l.2, %r ], [ %l.1, %lor.lhs.false ], [ %l.0, %p ] + %m.4 = phi i64 [ %m.3, %land.lhs.true ], [ %m.2, %r ], [ %m.1, %lor.lhs.false ], [ %m.0, %p ] + br label %if.then19 + +if.then19: + %m.550 = phi i64 [ 0, %entry ], [ %m.4, %s ] + %l.5493 = phi i64 [ 0, %entry ], [ %l.4, %s ] + %xor = xor i64 0, 0 + %not21 = xor i64 0, 0 + br i1 false, label %r, label %if.end25 + +if.end25: + br i1 false, label %if.end29, label %p + +if.end29: + br label %p +} + diff --git a/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test b/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test index 246337866a777..27f1252b3d5d1 100644 --- a/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test +++ b/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test @@ -2,7 +2,7 @@ # RUN: llvm-objdump -p %t | FileCheck %s # CHECK: Program Header: -# CHECK-NEXT: {{ }}PROPERTY{{ }} +# CHECK-NEXT: {{^}}PROPERTY{{ }} --- !ELF FileHeader: diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp index d78cf485587e1..e9e5b059f1786 100644 --- a/llvm/tools/llvm-objdump/ELFDump.cpp +++ b/llvm/tools/llvm-objdump/ELFDump.cpp @@ -269,7 +269,7 @@ template void ELFDumper::printProgramHeaders() { outs() << " RELRO "; 
break; case ELF::PT_GNU_PROPERTY: - outs() << " PROPERTY "; + outs() << "PROPERTY "; break; case ELF::PT_GNU_STACK: outs() << " STACK "; diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp index f73f2b20e9fea..f0e34aa273369 100644 --- a/llvm/unittests/IR/AttributesTest.cpp +++ b/llvm/unittests/IR/AttributesTest.cpp @@ -437,6 +437,14 @@ TEST(Attributes, SetIntersect) { break; case Attribute::Range: break; + case Attribute::Captures: + V0 = CaptureInfo(CaptureComponents::AddressIsNull, + CaptureComponents::None) + .toIntValue(); + V1 = CaptureInfo(CaptureComponents::None, + CaptureComponents::ReadProvenance) + .toIntValue(); + break; default: ASSERT_FALSE(true); } @@ -516,6 +524,11 @@ TEST(Attributes, SetIntersect) { ASSERT_EQ(Res->getAttribute(Kind).getRange(), ConstantRange(APInt(32, 0), APInt(32, 20))); break; + case Attribute::Captures: + ASSERT_EQ(Res->getCaptureInfo(), + CaptureInfo(CaptureComponents::AddressIsNull, + CaptureComponents::ReadProvenance)); + break; default: ASSERT_FALSE(true); } diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 3955d36fce896..3a7ea4550d417 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -656,7 +656,7 @@ TEST(ParseArchString, RejectsConflictingExtensions) { for (StringRef Input : {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2", "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcicm0p2", - "rv64i_xqcics0p2", "rv64i_xqcicli0p2"}) { + "rv64i_xqcics0p2", "rv64i_xqcicli0p2", "rv64i_xqciint0p2"}) { EXPECT_THAT( toString(RISCVISAInfo::parseArchString(Input, true).takeError()), ::testing::EndsWith(" is only supported for 'rv32'")); @@ -1121,6 +1121,7 @@ Experimental extensions xqcicm 0.2 xqcics 0.2 xqcicsr 0.2 + xqciint 0.2 xqcilsm 0.2 xqcisls 0.2 diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 1f69190e4bec5..84d9af0ec48f2 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1343,7 +1343,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_FPRCVT, AArch64::AEK_CMPBR, AArch64::AEK_LSUI, AArch64::AEK_OCCMO, AArch64::AEK_PCDPHINT, AArch64::AEK_POPS, - AArch64::AEK_SVEAES}; + AArch64::AEK_SVEAES, AArch64::AEK_SVEBITPERM, + AArch64::AEK_SSVE_BITPERM, + }; std::vector Features; @@ -1382,7 +1384,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sve2-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sm4")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sha3")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-bitperm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-bitperm")); + EXPECT_TRUE(llvm::is_contained(Features, "+ssve-bitperm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve-aes2")); EXPECT_TRUE(llvm::is_contained(Features, "+ssve-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2p1")); @@ -1554,6 +1558,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"sve2-sha3", "nosve2-sha3", "+sve2-sha3", "-sve2-sha3"}, {"sve2p1", "nosve2p1", "+sve2p1", "-sve2p1"}, {"sve2p2", "nosve2p2", "+sve2p2", "-sve2p2"}, + {"sve-bitperm", "nosve-bitperm", "+sve-bitperm", "-sve-bitperm"}, + {"ssve-bitperm", "nossve-bitperm", "+ssve-bitperm", "-ssve-bitperm"}, {"sve2-bitperm", "nosve2-bitperm", "+sve2-bitperm", "-sve2-bitperm"}, {"sve-aes2", "nosve-aes2", "+sve-aes2", "-sve-aes2"}, 
{"ssve-aes", "nossve-aes", "+ssve-aes", "-ssve-aes"}, @@ -1754,13 +1760,13 @@ AArch64ExtensionDependenciesBaseArchTestParams // Long dependency chains: sve2-bitperm -> sve2 -> sve -> fp16 -> fp {AArch64::ARMV8A, - {"nofp", "sve2-bitperm"}, - {"fp-armv8", "fullfp16", "sve", "sve2", "sve2-bitperm"}, + {"nofp", "sve2", "sve-bitperm"}, + {"fp-armv8", "fullfp16", "sve", "sve2", "sve-bitperm"}, {}}, {AArch64::ARMV8A, - {"sve2-bitperm", "nofp16"}, + {"sve2", "sve-bitperm", "nofp16"}, {"fp-armv8"}, - {"full-fp16", "sve", "sve2", "sve2-bitperm"}}, + {"full-fp16", "sve", "sve2", "sve-bitperm"}}, // Meaning of +crypto varies with base architecture. {AArch64::ARMV8A, {"crypto"}, {"aes", "sha2"}, {}}, @@ -1803,7 +1809,7 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"nofp", "fprcvt"}, {"fp-armv8", "fprcvt"}, {}}, {AArch64::ARMV9_6A, {"fprcvt", "nofp"}, {}, {"fp-armv8", "fprcvt"}}, - // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm} + // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm, fp8dot4, fp8dot2} {AArch64::ARMV8A, {"nosimd", "aes"}, {"neon", "aes"}, {}}, {AArch64::ARMV8A, {"aes", "nosimd"}, {}, {"neon", "aes"}}, {AArch64::ARMV8A, {"nosimd", "sha2"}, {"neon", "sha2"}, {}}, @@ -1816,6 +1822,10 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"f8f16mm", "nosimd"}, {}, {"neon", "f8f16mm"}}, {AArch64::ARMV9_6A, {"nosimd", "f8f32mm"}, {"neon", "f8f32mm"}, {}}, {AArch64::ARMV9_6A, {"f8f32mm", "nosimd"}, {}, {"neon", "f8f32mm"}}, + {AArch64::ARMV9_6A, {"nosimd", "fp8dot4"}, {"neon", "fp8dot4"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot4", "nosimd"}, {}, {"neon", "fp8dot4"}}, + {AArch64::ARMV9_6A, {"nosimd", "fp8dot2"}, {"neon", "fp8dot2"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot2", "nosimd"}, {}, {"neon", "fp8dot2"}}, // simd -> {rdm, dotprod, fcma} {AArch64::ARMV8A, {"nosimd", "rdm"}, {"neon", "rdm"}, {}}, @@ -1860,12 +1870,20 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"sve2p1", "nosve2"}, {}, {"sve2", "sve2p1"}}, {AArch64::ARMV8A, {"nosve2", "sve2-bitperm"}, - {"sve2", "sve2-bitperm"}, + {"sve2", "sve-bitperm"}, {}}, {AArch64::ARMV8A, {"sve2-bitperm", "nosve2"}, - {}, - {"sve2", "sve2-bitperm"}}, + {"sve"}, + {"sve-bitperm", "sve2", "sve2-bitperm"}}, + {AArch64::ARMV8A, + {"ssve-bitperm", "nosve-bitperm"}, + {"sme"}, + {"ssve-bitperm", "sve-bitperm"}}, + {AArch64::ARMV8A, + {"nosve-bitperm", "ssve-bitperm"}, + {"sve-bitperm", "sve-bitperm"}, + {""}}, {AArch64::ARMV8A, {"nosve2", "sve2-sha3"}, {"sve2", "sve2-sha3"}, {}}, {AArch64::ARMV8A, {"sve2-sha3", "nosve2"}, {}, {"sve2", "sve2-sha3"}}, {AArch64::ARMV8A, {"nosve2", "sve2-sm4"}, {"sve2", "sve2-sm4"}, {}}, @@ -1940,7 +1958,8 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"nosme2p1", "sme2p2"}, {"sme2p2", "sme2p1"}, {}}, {AArch64::ARMV9_6A, {"sme2p2", "nosme2p1"}, {}, {"sme2p1", "sme2p2"}}, - // fp8 -> {sme-f8f16, sme-f8f32, f8f16mm, f8f32mm} + // fp8 -> {sme-f8f16, sme-f8f32, f8f16mm, f8f32mm, fp8dot4, fp8dot2, + // ssve-fp8dot4, ssve-fp8dot2} {AArch64::ARMV8A, {"nofp8", "sme-f8f16"}, {"fp8", "sme-f8f16"}, {}}, {AArch64::ARMV8A, {"sme-f8f16", "nofp8"}, {}, {"fp8", "sme-f8f16"}}, {AArch64::ARMV8A, {"nofp8", "sme-f8f32"}, {"fp8", "sme-f8f32"}, {}}, @@ -1949,6 +1968,26 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"f8f16mm", "nofp8"}, {}, {"fp8", "f8f16mm"}}, {AArch64::ARMV9_6A, {"nofp8", "f8f32mm"}, {"fp8", "f8f32mm"}, {}}, {AArch64::ARMV9_6A, {"f8f32mm", "nofp8"}, {}, {"fp8", "f8f32mm"}}, + {AArch64::ARMV9_6A, {"nofp8", "fp8dot4"}, {"fp8", 
"fp8dot4"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot4", "nofp8"}, {}, {"fp8", "fp8dot4"}}, + {AArch64::ARMV9_6A, {"nofp8", "fp8dot2"}, {"fp8", "fp8dot2"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot2", "nofp8"}, {}, {"fp8", "fp8dot2"}}, + {AArch64::ARMV9_6A, + {"nofp8", "ssve-fp8dot4"}, + {"fp8", "ssve-fp8dot4"}, + {}}, + {AArch64::ARMV9_6A, + {"ssve-fp8dot4", "nofp8"}, + {}, + {"fp8", "ssve-fp8dot4"}}, + {AArch64::ARMV9_6A, + {"nofp8", "ssve-fp8dot2"}, + {"fp8", "ssve-fp8dot2"}, + {}}, + {AArch64::ARMV9_6A, + {"ssve-fp8dot2", "nofp8"}, + {}, + {"fp8", "ssve-fp8dot2"}}, // lse -> lse128 {AArch64::ARMV8A, {"nolse", "lse128"}, {"lse", "lse128"}, {}}, @@ -2015,10 +2054,10 @@ AArch64ExtensionDependenciesBaseCPUTestParams {}}, {"cortex-a520", {}, - {"v9.2a", "bf16", "crc", "dotprod", "flagm", "fp-armv8", - "fullfp16", "fp16fml", "i8mm", "lse", "mte", "pauth", - "perfmon", "predres", "ras", "rcpc", "rdm", "sb", - "neon", "ssbs", "sve", "sve2-bitperm", "sve2"}, + {"v9.2a", "bf16", "crc", "dotprod", "flagm", "fp-armv8", + "fullfp16", "fp16fml", "i8mm", "lse", "mte", "pauth", + "perfmon", "predres", "ras", "rcpc", "rdm", "sb", + "neon", "ssbs", "sve", "sve-bitperm", "sve2"}, {}}, // Negative modifiers @@ -2033,4 +2072,4 @@ INSTANTIATE_TEST_SUITE_P( AArch64ExtensionDependenciesBaseCPUTestFixture, ::testing::ValuesIn(AArch64ExtensionDependenciesCPUData)); -} // namespace \ No newline at end of file +} // namespace diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 7f4230affca09..1fe322c88bb0f 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -415,9 +415,9 @@ void CodeGenSchedModels::collectSTIPredicates() { for (const Record *R : Records.getAllDerivedDefinitions("STIPredicate")) { const Record *Decl = R->getValueAsDef("Declaration"); - const auto [It, Inserted] = - Decl2Index.try_emplace(Decl, STIPredicates.size()); - if (Inserted) { + const auto It = Decl2Index.find(Decl); + if (It == Decl2Index.end()) { + Decl2Index[Decl] = STIPredicates.size(); STIPredicateFunction Predicate(Decl); Predicate.addDefinition(R); STIPredicates.emplace_back(std::move(Predicate)); diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index c6cd3da13646a..607a6bd27c21f 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -1147,19 +1147,16 @@ OperandType RecognizableInstr::typeFromString(const std::string &s, TYPE("VK4Pair", TYPE_VK_PAIR) TYPE("VK8Pair", TYPE_VK_PAIR) TYPE("VK16Pair", TYPE_VK_PAIR) + TYPE("vx32mem", TYPE_MVSIBX) TYPE("vx64mem", TYPE_MVSIBX) - TYPE("vx128mem", TYPE_MVSIBX) - TYPE("vx256mem", TYPE_MVSIBX) - TYPE("vy128mem", TYPE_MVSIBY) - TYPE("vy256mem", TYPE_MVSIBY) + TYPE("vy32mem", TYPE_MVSIBY) + TYPE("vy64mem", TYPE_MVSIBY) + TYPE("vx32xmem", TYPE_MVSIBX) TYPE("vx64xmem", TYPE_MVSIBX) - TYPE("vx128xmem", TYPE_MVSIBX) - TYPE("vx256xmem", TYPE_MVSIBX) - TYPE("vy128xmem", TYPE_MVSIBY) - TYPE("vy256xmem", TYPE_MVSIBY) - TYPE("vy512xmem", TYPE_MVSIBY) - TYPE("vz256mem", TYPE_MVSIBZ) - TYPE("vz512mem", TYPE_MVSIBZ) + TYPE("vy32xmem", TYPE_MVSIBY) + TYPE("vy64xmem", TYPE_MVSIBY) + TYPE("vz32mem", TYPE_MVSIBZ) + TYPE("vz64mem", TYPE_MVSIBZ) TYPE("BNDR", TYPE_BNDR) TYPE("TILE", TYPE_TMM) TYPE("TILEPair", TYPE_TMM_PAIR) @@ -1372,19 +1369,16 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s, ENCODING("anymem", ENCODING_RM) ENCODING("opaquemem", ENCODING_RM) 
ENCODING("sibmem", ENCODING_SIB) + ENCODING("vx32mem", ENCODING_VSIB) ENCODING("vx64mem", ENCODING_VSIB) - ENCODING("vx128mem", ENCODING_VSIB) - ENCODING("vx256mem", ENCODING_VSIB) - ENCODING("vy128mem", ENCODING_VSIB) - ENCODING("vy256mem", ENCODING_VSIB) + ENCODING("vy32mem", ENCODING_VSIB) + ENCODING("vy64mem", ENCODING_VSIB) + ENCODING("vx32xmem", ENCODING_VSIB) ENCODING("vx64xmem", ENCODING_VSIB) - ENCODING("vx128xmem", ENCODING_VSIB) - ENCODING("vx256xmem", ENCODING_VSIB) - ENCODING("vy128xmem", ENCODING_VSIB) - ENCODING("vy256xmem", ENCODING_VSIB) - ENCODING("vy512xmem", ENCODING_VSIB) - ENCODING("vz256mem", ENCODING_VSIB) - ENCODING("vz512mem", ENCODING_VSIB) + ENCODING("vy32xmem", ENCODING_VSIB) + ENCODING("vy64xmem", ENCODING_VSIB) + ENCODING("vz32mem", ENCODING_VSIB) + ENCODING("vz64mem", ENCODING_VSIB) errs() << "Unhandled memory encoding " << s << "\n"; llvm_unreachable("Unhandled memory encoding"); } diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index e1cc02e1a608c..1a875c2b523e4 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -557,6 +557,10 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): UTC_AVOID = "NOTE: Do not autogenerate" UNUSED_NOTE = "NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:" +DATA_LAYOUT_RE = re.compile( + r"target\s+datalayout\s+=\s+\"(?P.+)\"$", flags=(re.M | re.S) +) + OPT_FUNCTION_RE = re.compile( r"^(\s*;\s*Function\sAttrs:\s(?P[\w\s():,]+?))?\s*define\s+(?P[^@]*)@(?P[\w.$-]+?)\s*" r"(?P\((\)|(.*?[\w.-]+?)\))[^{]*\{)\n(?P.*?)^\}$", @@ -651,6 +655,18 @@ def get_triple_from_march(march): return "x86" +def get_globals_name_prefix(raw_tool_output): + m = DATA_LAYOUT_RE.search(raw_tool_output) + if not m: + return None + data_layout = m.group("layout") + idx = data_layout.find("m:") + if idx < 0: + return None + ch = data_layout[idx + 2] + return "_" if ch == "o" or ch == "x" else None + + def apply_filters(line, filters): has_filter = False for f in filters: diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 83c0811b6814a..639095b698c6f 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1462,7 +1462,6 @@ if (current_toolchain == default_toolchain) { "__locale_dir/locale_base_api.h", "__locale_dir/locale_base_api/android.h", "__locale_dir/locale_base_api/bsd_locale_fallbacks.h", - "__locale_dir/locale_base_api/fuchsia.h", "__locale_dir/locale_base_api/ibm.h", "__locale_dir/locale_base_api/musl.h", "__locale_dir/locale_base_api/openbsd.h", @@ -1470,6 +1469,9 @@ if (current_toolchain == default_toolchain) { "__locale_dir/support/apple.h", "__locale_dir/support/bsd_like.h", "__locale_dir/support/freebsd.h", + "__locale_dir/support/fuchsia.h", + "__locale_dir/support/no_locale/characters.h", + "__locale_dir/support/no_locale/strtonum.h", "__locale_dir/support/windows.h", "__math/abs.h", "__math/copysign.h", @@ -1510,7 +1512,6 @@ if (current_toolchain == default_toolchain) { "__memory/array_cookie.h", "__memory/assume_aligned.h", "__memory/auto_ptr.h", - "__memory/builtin_new_allocator.h", "__memory/compressed_pair.h", "__memory/concepts.h", "__memory/construct_at.h", @@ -1843,6 +1844,7 @@ if (current_toolchain == default_toolchain) { "__utility/cmp.h", "__utility/convert_to_integral.h", "__utility/declval.h", + "__utility/element_count.h", "__utility/empty.h", 
"__utility/exception_guard.h", "__utility/exchange.h", diff --git a/llvm/utils/update_cc_test_checks.py b/llvm/utils/update_cc_test_checks.py index 3ffb07ddf6ad8..7a4796eaabb3b 100755 --- a/llvm/utils/update_cc_test_checks.py +++ b/llvm/utils/update_cc_test_checks.py @@ -34,7 +34,7 @@ } -def get_line2func_list(args, clang_args): +def get_line2func_list(args, clang_args, globals_name_prefix): ret = collections.defaultdict(list) # Use clang's JSON AST dump to get the mangled name json_dump_args = [args.clang] + clang_args + ["-fsyntax-only", "-o", "-"] @@ -122,6 +122,14 @@ def parse_clang_ast_json(node, loc, search): if search is None: search = spell mangled = node.get("mangledName", spell) + # Clang's AST dump includes the globals prefix, but when Clang emits + # LLVM IR this is not included and instead added as part of the asm + # output. Strip it from the mangled name of globals when needed + # (see DataLayout::getGlobalPrefix()). + if globals_name_prefix: + storage = node.get("storageClass", None) + if storage != "static" and mangled[0] == globals_name_prefix: + mangled = mangled[1:] ret[int(line) - 1].append((spell, mangled, search)) ast = json.loads(stdout) @@ -249,10 +257,10 @@ def config(): return args, parser -def get_function_body(builder, args, filename, clang_args, extra_commands, prefixes): +def get_function_body( + builder, args, filename, clang_args, extra_commands, prefixes, raw_tool_output +): # TODO Clean up duplication of asm/common build_function_body_dictionary - # Invoke external tool and extract function bodies. - raw_tool_output = common.invoke_tool(args.clang, clang_args, filename) for extra_command in extra_commands: extra_args = shlex.split(extra_command) with tempfile.NamedTemporaryFile() as f: @@ -383,13 +391,23 @@ def main(): common.debug("Extracted clang cmd: clang {}".format(clang_args)) common.debug("Extracted FileCheck prefixes: {}".format(prefixes)) + # Invoke external tool and extract function bodies. + raw_tool_output = common.invoke_tool(ti.args.clang, clang_args, ti.path) get_function_body( - builder, ti.args, ti.path, clang_args, extra_commands, prefixes + builder, + ti.args, + ti.path, + clang_args, + extra_commands, + prefixes, + raw_tool_output, ) # Invoke clang -Xclang -ast-dump=json to get mapping from start lines to # mangled names. Forward all clang args for now. - for k, v in get_line2func_list(ti.args, clang_args).items(): + for k, v in get_line2func_list( + ti.args, clang_args, common.get_globals_name_prefix(raw_tool_output) + ).items(): line2func_list[k].extend(v) func_dict = builder.finish_and_get_func_dict() diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 717a503468a85..0679db9cf93e1 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -668,12 +668,31 @@ function(add_mlir_python_extension libname extname) elseif(ARG_PYTHON_BINDINGS_LIBRARY STREQUAL "nanobind") nanobind_add_module(${libname} NB_DOMAIN mlir + FREE_THREADED ${ARG_SOURCES} ) if (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL) # Avoids warnings from upstream nanobind. 
- target_compile_options(nanobind-static + set(nanobind_target "nanobind-static") + if (NOT TARGET ${nanobind_target}) + # Get the correct nanobind target name (nanobind-static-ft or similar); + # it is set by the nanobind_add_module function according to the passed options. + get_property(all_targets DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY BUILDSYSTEM_TARGETS) + + # Iterate over the list of targets + foreach(target ${all_targets}) + # Check if the target name matches the given string + if("${target}" MATCHES "nanobind-") + set(nanobind_target "${target}") + endif() + endforeach() + + if (NOT TARGET ${nanobind_target}) + message(FATAL_ERROR "Could not find nanobind target to set compile options to") + endif() + endif() + target_compile_options(${nanobind_target} PRIVATE -Wno-cast-qual -Wno-zero-length-array diff --git a/mlir/docs/Bindings/Python.md index 32df3310d811d..b8bd0f507a510 100644 --- a/mlir/docs/Bindings/Python.md +++ b/mlir/docs/Bindings/Python.md @@ -1187,3 +1187,43 @@ or nanobind and utilities to connect to the rest of Python API. The bindings can be located in a separate module or in the same module as attributes and types, and loaded along with the dialect. + +## Free-threading (No-GIL) support + +Free-threading, or no-GIL, support refers to the CPython interpreter (>= 3.13) with the Global Interpreter Lock made optional. For details on the topic, please check [PEP-703](https://peps.python.org/pep-0703/) and this [Python free-threading guide](https://py-free-threading.github.io/). + +MLIR Python bindings are free-threading compatible, with the exceptions discussed below, in the following sense: it is safe to work in multiple threads with **independent** contexts. Below is an example of safe usage: + +```python +# python3.13t example.py +import concurrent.futures + +import mlir.dialects.arith as arith +from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint + + +def func(py_value): + with Context() as ctx: + module = Module.create(loc=Location.file("foo.txt", 0, 0)) + + dtype = IntegerType.get_signless(64) + with InsertionPoint(module.body), Location.name("a"): + arith.constant(dtype, py_value) + + return module + + +num_workers = 8 +with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for i in range(num_workers): + futures.append(executor.submit(func, i)) + assert len(list(f.result() for f in futures)) == num_workers +``` + +The exceptions to free-threading compatibility are listed below, with a short illustrative sketch after the list: +- IR printing is unsafe, e.g. when using `PassManager` with `PassManager.enable_ir_printing()`, which calls the thread-unsafe `llvm::raw_ostream`. +- Usage of `Location.emit_error` is unsafe (due to the thread-unsafe `llvm::raw_ostream`). +- Usage of `Module.dump` is unsafe (due to the thread-unsafe `llvm::raw_ostream`). +- Usage of `mlir.dialects.transform.interpreter` is unsafe. +- Usage of `mlir.dialects.gpu` and `gpu-module-to-binary` is unsafe.
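To make the boundary of the guarantee concrete, here is a minimal sketch (illustrative only, not part of the upstream docs; the helper names are hypothetical) contrasting per-thread contexts, which the guarantee covers, with a single `Context` shared across threads, which it does not:

```python
# Minimal sketch, assuming an mlir package built from this patch.
# Per-thread contexts are covered by the compatibility guarantee;
# sharing one Context across threads is not and may race.
import concurrent.futures

from mlir.ir import Context, Location, Module


def make_module_safe(_):
    # Supported: every task owns an independent Context.
    with Context():
        return Module.create(loc=Location.unknown())


shared_ctx = Context()


def make_module_unsafe(_):
    # Not covered: all tasks mutate the same Context concurrently.
    with shared_ctx:
        return Module.create(loc=Location.unknown())


with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    list(executor.map(make_module_safe, range(4)))      # safe
    # list(executor.map(make_module_unsafe, range(4)))  # avoid: shared state
```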
\ No newline at end of file diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 0b9097e9bbca2..04042903e343e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1699,8 +1699,8 @@ def NVVM_MmaOp : NVVM_Op<"mma.sync", [AttrSizedOperandSegments]> { | f16 | .m8n8k4 | row/col | row/col | 2x f16x2 | 2x f16x2 | 4x f16x2 or 8xf32 | | | .m16n8k8 | row | col | 2x f16x2 | 1x f16x2 | 2x f16x2 or 4 f32 | | | .m16n8k16 | row | col | 4x f16x2 | 2x f16x2 | 2x f16x2 or 4 f32 | - | bf16 | .m16n8k8 | row | col | 2x f16x2 | 1x f16x2 | 2x f16x2 or 4 f32 | - | | .m16n8k16 | row | col | 4x f16x2 | 2x f16x2 | 2x f16x2 or 4 f32 | + | bf16 | .m16n8k8 | row | col | 2x i32 | 1x i32 | 4x f32 | + | | .m16n8k16 | row | col | 4x i32 | 2x i32 | 4x f32 | | tf32 | .m16n8k4 | row | col | 2x i32 | 1x i32 | 4x f32 | | | .m16n8k8 | row | col | 4x i32 | 2x i32 | 2x f16x2 or 4 f32 | | u8/s8 | .m8n8k16 | row | col | 1x i32 | 1x i32 | 2x i32 | diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h index e99b0476a6b10..69cc2e32285b6 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h +++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h @@ -23,6 +23,13 @@ namespace LLVM { void registerInlinerInterface(DialectRegistry &registry); } // namespace LLVM + +namespace NVVM { +/// Register the `NVVMInlinerInterface` implementation of +/// `DialectInlinerInterface` with the NVVM dialect. +void registerInlinerInterface(DialectRegistry &registry); +} // namespace NVVM + } // namespace mlir #endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_INLINERINTERFACEIMPL_H diff --git a/mlir/include/mlir/InitAllDialects.h index c102f811cce4b..0da82825c8287 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -167,6 +167,7 @@ inline void registerAllDialects(DialectRegistry &registry) { gpu::registerBufferDeallocationOpInterfaceExternalModels(registry); gpu::registerValueBoundsOpInterfaceExternalModels(registry); LLVM::registerInlinerInterface(registry); + NVVM::registerInlinerInterface(registry); linalg::registerAllDialectInterfaceImplementations(registry); linalg::registerRuntimeVerifiableOpInterfaceExternalModels(registry); memref::registerAllocationOpInterfaceExternalModels(registry); diff --git a/mlir/include/mlir/Transforms/InliningUtils.h index 88fc033a6ab7b..becfe9b047ef4 100644 --- a/mlir/include/mlir/Transforms/InliningUtils.h +++ b/mlir/include/mlir/Transforms/InliningUtils.h @@ -176,6 +176,13 @@ class DialectInlinerInterface /// is invoked before inlined terminator operations have been processed. virtual void processInlinedCallBlocks( Operation *call, iterator_range<Region::iterator> inlinedBlocks) const {} + + /// Returns true if the inliner can take the fast path of not creating a new + /// block when the inlined region contains only one block. + virtual bool allowSingleBlockOptimization( + iterator_range<Region::iterator> inlinedBlocks) const { + return true; + } }; /// This interface provides the hooks into the inlining interface.
@@ -223,6 +230,9 @@ class InlinerInterface virtual void processInlinedCallBlocks( Operation *call, iterator_range<Region::iterator> inlinedBlocks) const; + + virtual bool allowSingleBlockOptimization( + iterator_range<Region::iterator> inlinedBlocks) const; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Bindings/Python/Globals.h index 0ec522d14f74b..826a34a535176 100644 --- a/mlir/lib/Bindings/Python/Globals.h +++ b/mlir/lib/Bindings/Python/Globals.h @@ -24,6 +24,7 @@ namespace mlir { namespace python { /// Globals that are always accessible once the extension has been initialized. +/// Methods of this class are thread-safe. class PyGlobals { public: PyGlobals(); @@ -37,12 +38,18 @@ class PyGlobals { /// Get and set the list of parent modules to search for dialect /// implementation classes. - std::vector<std::string> &getDialectSearchPrefixes() { + std::vector<std::string> getDialectSearchPrefixes() { + nanobind::ft_lock_guard lock(mutex); return dialectSearchPrefixes; } void setDialectSearchPrefixes(std::vector<std::string> newValues) { + nanobind::ft_lock_guard lock(mutex); dialectSearchPrefixes.swap(newValues); } + void addDialectSearchPrefix(std::string value) { + nanobind::ft_lock_guard lock(mutex); + dialectSearchPrefixes.push_back(std::move(value)); + } /// Loads a python module corresponding to the given dialect namespace. /// No-ops if the module has already been loaded or is not found. Raises @@ -109,6 +116,9 @@ class PyGlobals { private: static PyGlobals *instance; + + nanobind::ft_mutex mutex; + /// Module name prefixes to search under for dialect implementation modules. std::vector<std::string> dialectSearchPrefixes; /// Map of dialect namespace to external dialect class object. diff --git a/mlir/lib/Bindings/Python/IRCore.cpp index 453d4f7c7e8bc..53806ca9f04a4 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -243,9 +243,15 @@ static MlirBlock createBlock(const nb::sequence &pyArgTypes, /// Wrapper for the global LLVM debugging flag. struct PyGlobalDebugFlag { - static void set(nb::object &o, bool enable) { mlirEnableGlobalDebug(enable); } + static void set(nb::object &o, bool enable) { + nb::ft_lock_guard lock(mutex); + mlirEnableGlobalDebug(enable); + } - static bool get(const nb::object &) { return mlirIsGlobalDebugEnabled(); } + static bool get(const nb::object &) { + nb::ft_lock_guard lock(mutex); + return mlirIsGlobalDebugEnabled(); + } static void bind(nb::module_ &m) { // Debug flags.
@@ -255,6 +261,7 @@ struct PyGlobalDebugFlag { .def_static( "set_types", [](const std::string &type) { + nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugType(type.c_str()); }, "types"_a, "Sets specific debug types to be produced by LLVM") @@ -263,11 +270,17 @@ struct PyGlobalDebugFlag { pointers.reserve(types.size()); for (const std::string &str : types) pointers.push_back(str.c_str()); + nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugTypes(pointers.data(), pointers.size()); }); } + +private: + static nb::ft_mutex mutex; }; +nb::ft_mutex PyGlobalDebugFlag::mutex; + struct PyAttrBuilderMap { static bool dunderContains(const std::string &attributeKind) { return PyGlobals::get().lookupAttributeBuilder(attributeKind).has_value(); @@ -606,6 +619,7 @@ class PyOpOperandIterator { PyMlirContext::PyMlirContext(MlirContext context) : context(context) { nb::gil_scoped_acquire acquire; + nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); liveContexts[context.ptr] = this; } @@ -615,7 +629,10 @@ PyMlirContext::~PyMlirContext() { // forContext method, which always puts the associated handle into // liveContexts. nb::gil_scoped_acquire acquire; - getLiveContexts().erase(context.ptr); + { + nb::ft_lock_guard lock(live_contexts_mutex); + getLiveContexts().erase(context.ptr); + } mlirContextDestroy(context); } @@ -632,6 +649,7 @@ nb::object PyMlirContext::createFromCapsule(nb::object capsule) { PyMlirContextRef PyMlirContext::forContext(MlirContext context) { nb::gil_scoped_acquire acquire; + nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); auto it = liveContexts.find(context.ptr); if (it == liveContexts.end()) { @@ -647,36 +665,56 @@ PyMlirContextRef PyMlirContext::forContext(MlirContext context) { return PyMlirContextRef(it->second, std::move(pyRef)); } +nb::ft_mutex PyMlirContext::live_contexts_mutex; + PyMlirContext::LiveContextMap &PyMlirContext::getLiveContexts() { static LiveContextMap liveContexts; return liveContexts; } -size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } +size_t PyMlirContext::getLiveCount() { + nb::ft_lock_guard lock(live_contexts_mutex); + return getLiveContexts().size(); +} -size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } +size_t PyMlirContext::getLiveOperationCount() { + nb::ft_lock_guard lock(liveOperationsMutex); + return liveOperations.size(); +} std::vector<PyOperation *> PyMlirContext::getLiveOperationObjects() { std::vector<PyOperation *> liveObjects; + nb::ft_lock_guard lock(liveOperationsMutex); for (auto &entry : liveOperations) liveObjects.push_back(entry.second.second); return liveObjects; } size_t PyMlirContext::clearLiveOperations() { - for (auto &op : liveOperations) + + LiveOperationMap operations; + { + nb::ft_lock_guard lock(liveOperationsMutex); + std::swap(operations, liveOperations); + } + for (auto &op : operations) op.second.second->setInvalid(); - size_t numInvalidated = liveOperations.size(); - liveOperations.clear(); + size_t numInvalidated = operations.size(); return numInvalidated; } void PyMlirContext::clearOperation(MlirOperation op) { - auto it = liveOperations.find(op.ptr); - if (it != liveOperations.end()) { - it->second.second->setInvalid(); + PyOperation *py_op; + { + nb::ft_lock_guard lock(liveOperationsMutex); + auto it = liveOperations.find(op.ptr); + if (it == liveOperations.end()) { + return; + } + py_op = it->second.second; liveOperations.erase(it); } + py_op->setInvalid(); } void PyMlirContext::clearOperationsInside(PyOperationBase &op) { @@ 
-1160,7 +1198,6 @@ PyOperation::~PyOperation() { PyOperationRef PyOperation::createInstance(PyMlirContextRef contextRef, MlirOperation operation, nb::object parentKeepAlive) { - auto &liveOperations = contextRef->liveOperations; // Create. PyOperation *unownedOperation = new PyOperation(std::move(contextRef), operation); @@ -1172,19 +1209,22 @@ PyOperationRef PyOperation::createInstance(PyMlirContextRef contextRef, if (parentKeepAlive) { unownedOperation->parentKeepAlive = std::move(parentKeepAlive); } - liveOperations[operation.ptr] = std::make_pair(pyRef, unownedOperation); return PyOperationRef(unownedOperation, std::move(pyRef)); } PyOperationRef PyOperation::forOperation(PyMlirContextRef contextRef, MlirOperation operation, nb::object parentKeepAlive) { + nb::ft_lock_guard lock(contextRef->liveOperationsMutex); auto &liveOperations = contextRef->liveOperations; auto it = liveOperations.find(operation.ptr); if (it == liveOperations.end()) { // Create. - return createInstance(std::move(contextRef), operation, - std::move(parentKeepAlive)); + PyOperationRef result = createInstance(std::move(contextRef), operation, + std::move(parentKeepAlive)); + liveOperations[operation.ptr] = + std::make_pair(result.getObject(), result.get()); + return result; } // Use existing. PyOperation *existing = it->second.second; @@ -1195,13 +1235,15 @@ PyOperationRef PyOperation::forOperation(PyMlirContextRef contextRef, PyOperationRef PyOperation::createDetached(PyMlirContextRef contextRef, MlirOperation operation, nb::object parentKeepAlive) { + nb::ft_lock_guard lock(contextRef->liveOperationsMutex); auto &liveOperations = contextRef->liveOperations; assert(liveOperations.count(operation.ptr) == 0 && "cannot create detached operation that already exists"); (void)liveOperations; - PyOperationRef created = createInstance(std::move(contextRef), operation, std::move(parentKeepAlive)); + liveOperations[operation.ptr] = + std::make_pair(created.getObject(), created.get()); created->attached = false; return created; } diff --git a/mlir/lib/Bindings/Python/IRModule.cpp index f7bf77e5a7e04..e600f1bbd4493 100644 --- a/mlir/lib/Bindings/Python/IRModule.cpp +++ b/mlir/lib/Bindings/Python/IRModule.cpp @@ -38,8 +38,11 @@ PyGlobals::PyGlobals() { PyGlobals::~PyGlobals() { instance = nullptr; } bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { - if (loadedDialectModules.contains(dialectNamespace)) - return true; + { + nb::ft_lock_guard lock(mutex); + if (loadedDialectModules.contains(dialectNamespace)) + return true; + } // Since re-entrancy is possible, make a copy of the search prefixes. std::vector<std::string> localSearchPrefixes = dialectSearchPrefixes; nb::object loaded = nb::none(); @@ -62,12 +65,14 @@ bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { return false; // Note: Iterator cannot be shared from prior to loading, since re-entrancy // may have occurred, which may do anything.
+ nb::ft_lock_guard lock(mutex); loadedDialectModules.insert(dialectNamespace); return true; } void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, nb::callable pyFunc, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = attributeBuilderMap[attributeKind]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Attribute builder for '") + @@ -81,6 +86,7 @@ void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, nb::callable typeCaster, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = typeCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Type caster is already registered with caster: " + @@ -90,6 +96,7 @@ void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, nb::callable valueCaster, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = valueCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Value caster is already registered: " + @@ -99,6 +106,7 @@ void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, nb::object pyClass) { + nb::ft_lock_guard lock(mutex); nb::object &found = dialectClassMap[dialectNamespace]; if (found) { throw std::runtime_error((llvm::Twine("Dialect namespace '") + @@ -110,6 +118,7 @@ void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, void PyGlobals::registerOperationImpl(const std::string &operationName, nb::object pyClass, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = operationClassMap[operationName]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Operation '") + operationName + @@ -121,6 +130,7 @@ void PyGlobals::registerOperationImpl(const std::string &operationName, std::optional<nb::callable> PyGlobals::lookupAttributeBuilder(const std::string &attributeKind) { + nb::ft_lock_guard lock(mutex); const auto foundIt = attributeBuilderMap.find(attributeKind); if (foundIt != attributeBuilderMap.end()) { assert(foundIt->second && "attribute builder is defined"); @@ -133,6 +143,7 @@ std::optional<nb::callable> PyGlobals::lookupTypeCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); + nb::ft_lock_guard lock(mutex); const auto foundIt = typeCasterMap.find(mlirTypeID); if (foundIt != typeCasterMap.end()) { assert(foundIt->second && "type caster is defined"); @@ -145,6 +156,7 @@ std::optional<nb::callable> PyGlobals::lookupValueCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); + nb::ft_lock_guard lock(mutex); const auto foundIt = valueCasterMap.find(mlirTypeID); if (foundIt != valueCasterMap.end()) { assert(foundIt->second && "value caster is defined"); @@ -158,6 +170,7 @@ PyGlobals::lookupDialectClass(const std::string &dialectNamespace) { // Make sure dialect module is loaded.
if (!loadDialectModule(dialectNamespace)) return std::nullopt; + nb::ft_lock_guard lock(mutex); const auto foundIt = dialectClassMap.find(dialectNamespace); if (foundIt != dialectClassMap.end()) { assert(foundIt->second && "dialect class is defined"); @@ -175,6 +188,7 @@ PyGlobals::lookupOperationClass(llvm::StringRef operationName) { if (!loadDialectModule(dialectNamespace)) return std::nullopt; + nb::ft_lock_guard lock(mutex); auto foundIt = operationClassMap.find(operationName); if (foundIt != operationClassMap.end()) { assert(foundIt->second && "OpView is defined"); diff --git a/mlir/lib/Bindings/Python/IRModule.h index 8fb32a225e65f..d1fb4308dbb77 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -260,6 +260,7 @@ class PyMlirContext { // Note that this holds a handle, which does not imply ownership. // Mappings will be removed when the context is destructed. using LiveContextMap = llvm::DenseMap<void *, PyMlirContext *>; + static nanobind::ft_mutex live_contexts_mutex; static LiveContextMap &getLiveContexts(); // Interns all live modules associated with this context. Modules tracked @@ -276,6 +277,9 @@ class PyMlirContext { // attempt to access it will raise an error. using LiveOperationMap = llvm::DenseMap<void *, std::pair<nanobind::handle, PyOperation *>>; + nanobind::ft_mutex liveOperationsMutex; + + // Guarded by liveOperationsMutex in free-threading mode. LiveOperationMap liveOperations; bool emitErrorDiagnostics = false; diff --git a/mlir/lib/Bindings/Python/MainModule.cpp index 7c4064262012e..6f49431006605 100644 --- a/mlir/lib/Bindings/Python/MainModule.cpp +++ b/mlir/lib/Bindings/Python/MainModule.cpp @@ -30,12 +30,8 @@ NB_MODULE(_mlir, m) { .def_prop_rw("dialect_search_modules", &PyGlobals::getDialectSearchPrefixes, &PyGlobals::setDialectSearchPrefixes) - .def( - "append_dialect_search_prefix", - [](PyGlobals &self, std::string moduleName) { - self.getDialectSearchPrefixes().push_back(std::move(moduleName)); - }, - "module_name"_a) + .def("append_dialect_search_prefix", &PyGlobals::addDialectSearchPrefix, + "module_name"_a) .def( "_check_dialect_module_loaded", [](PyGlobals &self, const std::string &dialectNamespace) { @@ -76,7 +72,6 @@ NB_MODULE(_mlir, m) { nanobind::cast<std::string>(opClass.attr("OPERATION_NAME")); PyGlobals::get().registerOperationImpl(operationName, opClass, replace); - // Dict-stuff the new opClass by name onto the dialect class.
diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp
index 7c4064262012e..6f49431006605 100644
--- a/mlir/lib/Bindings/Python/MainModule.cpp
+++ b/mlir/lib/Bindings/Python/MainModule.cpp
@@ -30,12 +30,8 @@ NB_MODULE(_mlir, m) {
       .def_prop_rw("dialect_search_modules",
                    &PyGlobals::getDialectSearchPrefixes,
                    &PyGlobals::setDialectSearchPrefixes)
-      .def(
-          "append_dialect_search_prefix",
-          [](PyGlobals &self, std::string moduleName) {
-            self.getDialectSearchPrefixes().push_back(std::move(moduleName));
-          },
-          "module_name"_a)
+      .def("append_dialect_search_prefix", &PyGlobals::addDialectSearchPrefix,
+           "module_name"_a)
       .def(
           "_check_dialect_module_loaded",
           [](PyGlobals &self, const std::string &dialectNamespace) {
@@ -76,7 +72,6 @@ NB_MODULE(_mlir, m) {
             nanobind::cast<std::string>(opClass.attr("OPERATION_NAME"));
         PyGlobals::get().registerOperationImpl(operationName, opClass, replace);
 
-        // Dict-stuff the new opClass by name onto the dialect class.
         nb::object opClassName = opClass.attr("__name__");
         dialectClass.attr(opClassName) = opClass;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 838159d676545..d8fde3e765ac4 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -445,8 +445,13 @@ LogicalResult MmaOp::verify() {
     expectedResult.push_back(LLVM::LLVMStructType::getLiteral(
         context, {f32Ty, f32Ty, f32Ty, f32Ty}));
     break;
-  case MMATypes::f16:
   case MMATypes::bf16:
+    kFactor = 8;
+    multiplicandFragType = i32Ty;
+    expectedResult.push_back(LLVM::LLVMStructType::getLiteral(
+        context, {f32Ty, f32Ty, f32Ty, f32Ty}));
+    break;
+  case MMATypes::f16:
     kFactor = 8;
     multiplicandFragType = f16x2Ty;
     expectedResult.push_back(f16x2x2StructTy);
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
index b3bed5ab5f412..79dd3e3069648 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h"
 #include "mlir/Analysis/SliceWalk.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
@@ -743,6 +744,14 @@ struct LLVMInlinerInterface : public DialectInlinerInterface {
     op->erase();
   }
 
+  bool allowSingleBlockOptimization(
+      iterator_range<Region::iterator> inlinedBlocks) const final {
+    if (!inlinedBlocks.empty() &&
+        isa<LLVM::UnreachableOp>(inlinedBlocks.begin()->getTerminator()))
+      return false;
+    return true;
+  }
+
   /// Handle the given inlined return by replacing the uses of the call with the
   /// operands of the return. This overload is called when the inlined region
   /// only contains one block.
@@ -815,3 +824,9 @@ void mlir::LLVM::registerInlinerInterface(DialectRegistry &registry) {
     dialect->addInterfaces<LLVMInlinerInterface>();
   });
 }
+
+void mlir::NVVM::registerInlinerInterface(DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) {
+    dialect->addInterfaces<NVVMInlinerInterface>();
+  });
+}
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 8973e87c063b3..c13b663dbf05b 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -2890,7 +2890,7 @@ FailureOr<SmallVector<Value>> SoftmaxOp::decomposeOperation(OpBuilder &b) {
   dims.erase(dims.begin() + reductionDim);
   // Step 1: Compute max along dim.
   Value outputReduce = b.create<tensor::EmptyOp>(loc, dims, elementType);
-  Value neutralForMaxF = arith::getIdentityValue(arith::AtomicRMWKind::maximumf,
+  Value neutralForMaxF = arith::getIdentityValue(arith::AtomicRMWKind::maxnumf,
                                                  elementType, b, loc,
                                                  /*useOnlyFiniteValue=*/true);
   Value neutralForMaxFInit =
diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp
index c360c61afd27b..20a43ef15d09e 100644
--- a/mlir/lib/TableGen/Operator.cpp
+++ b/mlir/lib/TableGen/Operator.cpp
@@ -503,8 +503,8 @@ void Operator::populateTypeInferenceInfo(
       for (int otherResultIndex : resultIndices) {
         if (resultIndex == otherResultIndex)
           continue;
-        inference[resultIndex].sources.emplace_back(otherResultIndex,
-                                                    "$_self");
+        inference[resultIndex].sources.emplace_back(
+            InferredResultType::unmapResultIndex(otherResultIndex), "$_self");
       }
     }
   }
diff --git a/mlir/lib/Transforms/Utils/InliningUtils.cpp b/mlir/lib/Transforms/Utils/InliningUtils.cpp
index 0db097d14cd3c..0cae63c58ca7b 100644
--- a/mlir/lib/Transforms/Utils/InliningUtils.cpp
+++ b/mlir/lib/Transforms/Utils/InliningUtils.cpp
@@ -118,6 +118,18 @@ void InlinerInterface::handleTerminator(Operation *op,
   handler->handleTerminator(op, valuesToRepl);
 }
 
+/// Returns true if the inliner can assume a fast path of not creating a
+/// new block, if there is only one block.
+bool InlinerInterface::allowSingleBlockOptimization(
+    iterator_range<Region::iterator> inlinedBlocks) const {
+  if (inlinedBlocks.empty()) {
+    return true;
+  }
+  auto *handler = getInterfaceFor(inlinedBlocks.begin()->getParentOp());
+  assert(handler && "expected valid dialect handler");
+  return handler->allowSingleBlockOptimization(inlinedBlocks);
+}
+
 Value InlinerInterface::handleArgument(OpBuilder &builder, Operation *call,
                                        Operation *callable, Value argument,
                                        DictionaryAttr argumentAttrs) const {
@@ -294,8 +306,10 @@ inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock,
   interface.processInlinedCallBlocks(call, newBlocks);
   interface.processInlinedBlocks(newBlocks);
 
+  bool singleBlockFastPath = interface.allowSingleBlockOptimization(newBlocks);
+
   // Handle the case where only a single block was inlined.
-  if (std::next(newBlocks.begin()) == newBlocks.end()) {
+  if (singleBlockFastPath && std::next(newBlocks.begin()) == newBlocks.end()) {
     // Run the result attribute handler on the terminator operands.
     Operation *firstBlockTerminator = firstNewBlock->getTerminator();
     builder.setInsertionPoint(firstBlockTerminator);
diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt
index f240d6ef944ec..1a0075e829aef 100644
--- a/mlir/python/requirements.txt
+++ b/mlir/python/requirements.txt
@@ -2,4 +2,5 @@ nanobind>=2.4, <3.0
 numpy>=1.19.5, <=2.1.2
 pybind11>=2.10.0, <=2.13.6
 PyYAML>=5.4.0, <=6.0.1
-ml_dtypes>=0.1.0, <=0.5.0 # provides several NumPy dtype extensions, including the bf16
+ml_dtypes>=0.1.0, <=0.6.0; python_version<"3.13" # provides several NumPy dtype extensions, including the bf16
+ml_dtypes>=0.5.0, <=0.6.0; python_version>="3.13"
\ No newline at end of file
diff --git a/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir b/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir
new file mode 100644
index 0000000000000..6dc8ebb431508
--- /dev/null
+++ b/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir
@@ -0,0 +1,16 @@
+// RUN: mlir-opt %s -inline -split-input-file | FileCheck %s
+
+// UNSUPPORTED: system-windows
+
+llvm.func @threadidx() -> i32 {
+  %tid = nvvm.read.ptx.sreg.tid.x : i32
+  llvm.return %tid : i32
+}
+
+// CHECK-LABEL: func @caller
+llvm.func @caller() -> i32 {
+  // CHECK-NOT: llvm.call @threadidx
+  // CHECK: nvvm.read.ptx.sreg.tid.x
+  %z = llvm.call @threadidx() : () -> (i32)
+  llvm.return %z : i32
+}
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index edaac4da0b044..eb249a4771753 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -676,3 +676,19 @@ llvm.func @caller(%x : i32) -> i32 {
   %z = llvm.call @private_func(%x) : (i32) -> (i32)
   llvm.return %z : i32
 }
+
+// -----
+
+llvm.func @unreachable_func(%a : i32) -> i32 {
+  "llvm.intr.trap"() : () -> ()
+  llvm.unreachable
+}
+
+// CHECK-LABEL: func @caller
+llvm.func @caller(%x : i32) -> i32 {
+  // CHECK-NOT: llvm.call @unreachable_func
+  // CHECK: llvm.intr.trap
+  // CHECK: llvm.unreachable
+  %z = llvm.call @unreachable_func(%x) : (i32) -> (i32)
+  llvm.return %z : i32
+}
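[Editor's note — not part of the patch] The two tests above are driven by lit, but they can also be reproduced by hand. A hedged sketch of running the new NVVM inlining test manually; it assumes `mlir-opt` and `FileCheck` from this build are on `PATH`, and the file path is illustrative:

```python
# Mirror the test's RUN line: mlir-opt %s -inline -split-input-file | FileCheck %s
import subprocess

test = "mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir"
inlined = subprocess.run(
    ["mlir-opt", test, "-inline", "-split-input-file"],
    check=True, capture_output=True, text=True,
).stdout
# FileCheck reads its expected CHECK patterns from the test file itself.
subprocess.run(["FileCheck", test], input=inlined, check=True, text=True)
```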
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index a7bdceba01c1e..4c3b6648a41c0 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -163,6 +163,29 @@ func.func @nvvm_mma_m8n8k4_f16_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
   llvm.return %0 : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 }
 
+// CHECK-LABEL: @nvvm_mma_m16n8k8_bf16_bf16
+func.func @nvvm_mma_m16n8k8_bf16_bf16(%a0 : i32, %a1 : i32, %b0 : i32,
+                                      %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) {
+  // CHECK: nvvm.mma.sync A[{{.*}}] B[{{.*}}] C[{{.*}}] {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>, shape = #nvvm.shape<m = 16, n = 8, k = 8>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  %0 = nvvm.mma.sync A[%a0, %a1] B[%b0] C[%c0, %c1, %c2, %c3]
+      {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
+       multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>,
+       shape = #nvvm.shape<m = 16, n = 8, k = 8>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
+}
+
+// CHECK-LABEL: @nvvm_mma_m16n8k16_bf16_bf16
+func.func @nvvm_mma_m16n8k16_bf16_bf16(%a0 : i32, %a1 : i32, %a2 : i32, %a3 : i32,
+                                       %b0 : i32, %b1 : i32,
+                                       %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) {
+  // CHECK: nvvm.mma.sync A[{{.*}}] B[{{.*}}] C[{{.*}}] {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>, shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1, %c2, %c3]
+      {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
+       multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>,
+       shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
+}
+
 // CHECK-LABEL: @nvvm_mma_m8n8k16_s8_s8
 func.func @nvvm_mma_m8n8k16_s8_s8(%a0 : i32, %b0 : i32,
                                   %c0 : i32, %c1 : i32) {
diff --git a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
index 2e211d2fa7dbe..72acf43361f50 100644
--- a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
@@ -210,7 +210,7 @@ func.func @softmax(%arg0: tensor<2x16x32xf32>, %dst: tensor<2x16x32xf32>) -> ten
 // CHECK-LABEL: func.func @softmax(
 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<2x16x32xf32>, %[[DST:[a-zA-Z0-9_]+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
 // CHECK-DAG: %[[D1:.+]] = tensor.empty() : tensor<2x16xf32>
-// CHECK-DAG: %[[CST:.+]] = arith.constant -3.40282347E+38 : f32
+// CHECK-DAG: %[[CST:.+]] = arith.constant 0xFFC00000 : f32
 // CHECK: %[[D2:.+]] = linalg.fill ins(%[[CST]] : f32) outs(%[[D1]] : tensor<2x16xf32>) -> tensor<2x16xf32>
 // CHECK: %[[D3:.+]] = linalg.generic {indexing_maps = [#[[$MAP]], #[[$MAP1]]], iterator_types = ["parallel",
 // CHECK-SAME: "parallel", "reduction"]} ins(%[[ARG0]] : tensor<2x16x32xf32>) outs(%[[D2]] : tensor<2x16xf32>) {
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 2d7710e7cbf27..09e98765413f0 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -291,6 +291,18 @@ llvm.func @nvvm_mma_m16n8k16_f16_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
   llvm.return %0 : !llvm.struct<(vector<2xf16>, vector<2xf16>)>
 }
 
+// CHECK-LABEL: @nvvm_mma_m16n8k16_bf16_bf16
+llvm.func @nvvm_mma_m16n8k16_bf16_bf16(%a0 : i32, %a1 : i32, %a2 : i32, %a3 : i32,
+                                       %b0 : i32, %b1 : i32,
+                                       %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) -> !llvm.struct<(f32, f32, f32, f32)> {
+  // CHECK: call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.bf16
+  %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1, %c2, %c3]
+      {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
+       multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>,
+       shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
+}
+
 // f32 return type, f16 accumulate type
 // CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f16
 llvm.func @nvvm_mma_m16n8k16_f32_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
diff --git a/mlir/test/mlir-tblgen/op-result.td b/mlir/test/mlir-tblgen/op-result.td
index 51f8b0671a328..f668d9a5a6644 100644
--- a/mlir/test/mlir-tblgen/op-result.td
+++ b/mlir/test/mlir-tblgen/op-result.td
@@ -180,6 +180,27 @@ def OpL4 : NS_Op<"two_inference_edges", [
 // CHECK: inferredReturnTypes[1] = odsInferredType1
 // CHECK: inferredReturnTypes[2] = odsInferredType2
 
+def OpL5 : NS_Op<"op_with_same_but_unconstraint_results",
+    [AllTypesMatch<["result_a", "result_b"]>]> {
+  let results = (outs AnyType:$result_a, AnyType:$result_b);
+}
+
+// CHECK-NOT: LogicalResult OpL5::inferReturnTypes
+
+def OpL6 : NS_Op<"op_with_same_and_constraint_results",
+    [AllTypesMatch<["result_a", "result_b", "result_c"]>]> {
+  let results = (outs AnyType:$result_a, AnyType:$result_b, I32:$result_c);
+}
+
+// CHECK-LABEL: LogicalResult OpL6::inferReturnTypes
+// CHECK-NOT: }
+// CHECK: odsInferredType0 = odsBuilder.getIntegerType(32);
+// CHECK: odsInferredType1 = odsBuilder.getIntegerType(32);
+// CHECK: odsInferredType2 = odsBuilder.getIntegerType(32);
+// CHECK: inferredReturnTypes[0] = odsInferredType0;
+// CHECK: inferredReturnTypes[1] = odsInferredType1;
+// CHECK: inferredReturnTypes[2] = odsInferredType2;
+
 def OpM : NS_Op<"mix_diff_size_variadic_and_normal_results_op", [AttrSizedResultSegments]> {
   let results = (outs Variadic<AnyTensor>:$output1, AnyTensor:$output2, Optional<AnyTensor>:$output3);
 }
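[Editor's note — not part of the patch] The next file adds the free-threading test harness. As orientation before reading it, here is a minimal, self-contained sketch of the execution pattern the harness generalizes; the worker and run counts are arbitrary:

```python
# Minimal sketch of the harness's core pattern: release all workers together
# on a barrier, then run the test body repeatedly in each worker.
import concurrent.futures
import threading

NUM_WORKERS, NUM_RUNS = 4, 10  # arbitrary

def run_in_parallel(test_fn):
    barrier = threading.Barrier(NUM_WORKERS)

    def closure():
        barrier.wait()  # maximize overlap between workers
        for _ in range(NUM_RUNS):
            test_fn()

    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as pool:
        futures = [pool.submit(closure) for _ in range(NUM_WORKERS)]
        for f in futures:
            f.result()  # re-raise any exception from a worker

run_in_parallel(lambda: sum(range(100)))
```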
NS_Op<"op_with_same_and_constraint_results", + [AllTypesMatch<["result_a", "result_b", "result_c"]>]> { + let results = (outs AnyType:$result_a, AnyType:$result_b, I32:$result_c); +} + +// CHECK-LABEL: LogicalResult OpL6::inferReturnTypes +// CHECK-NOT: } +// CHECK: odsInferredType0 = odsBuilder.getIntegerType(32); +// CHECK: odsInferredType1 = odsBuilder.getIntegerType(32); +// CHECK: odsInferredType2 = odsBuilder.getIntegerType(32); +// CHECK: inferredReturnTypes[0] = odsInferredType0; +// CHECK: inferredReturnTypes[1] = odsInferredType1; +// CHECK: inferredReturnTypes[2] = odsInferredType2; + def OpM : NS_Op<"mix_diff_size_variadic_and_normal_results_op", [AttrSizedResultSegments]> { let results = (outs Variadic:$output1, AnyTensor:$output2, Optional:$output3); } diff --git a/mlir/test/python/multithreaded_tests.py b/mlir/test/python/multithreaded_tests.py new file mode 100644 index 0000000000000..6e1a668346872 --- /dev/null +++ b/mlir/test/python/multithreaded_tests.py @@ -0,0 +1,518 @@ +# RUN: %PYTHON %s +""" +This script generates multi-threaded tests to check free-threading mode using CPython compiled with TSAN. +Tests can be run using pytest: +```bash +python3.13t -mpytest -vvv multithreaded_tests.py +``` + +IMPORTANT. Running tests are not checking the correctness, but just the execution of the tests in multi-threaded context +and passing if no warnings reported by TSAN and failing otherwise. + + +Details on the generated tests and execution: +1) Multi-threaded execution: all generated tests are executed independently by +a pool of threads, running each test multiple times, see @multi_threaded for details + +2) Tests generation: we use existing tests: test/python/ir/*.py, +test/python/dialects/*.py, etc to generate multi-threaded tests. +In details, we perform the following: +a) we define a list of source tests to be used to generate multi-threaded tests, see `TEST_MODULES`. +b) we define `TestAllMultiThreaded` class and add existing tests to the class. See `add_existing_tests` method. +c) for each test file, we copy and modify it: test/python/ir/affine_expr.py -> /tmp/ir/affine_expr.py. +In order to import the test file as python module, we remove all executing functions, like +`@run` or `run(testMethod)`. See `copy_and_update` and `add_existing_tests` methods for details. + + +Observed warnings reported by TSAN. 
+
+Known data races in CPython free-threading:
+1) ctypes-related races: https://github.com/python/cpython/issues/127945
+2) LLVM-related data races: llvm::raw_ostream is not thread-safe
+- mlir pass manager
+- dialects/transform_interpreter.py
+- ir/diagnostic_handler.py
+- ir/module.py
+3) The gpu dialect's module-to-binary method is unsafe
+"""
+import concurrent.futures
+import gc
+import importlib.util
+import os
+import sys
+import threading
+import tempfile
+import unittest
+
+from contextlib import contextmanager
+from functools import partial
+from pathlib import Path
+from typing import Optional, List
+
+import mlir.dialects.arith as arith
+from mlir.dialects import transform
+from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint
+
+
+def import_from_path(module_name: str, file_path: Path):
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+def copy_and_update(src_filepath: Path, dst_filepath: Path):
+    # We should remove all calls like `run(testMethod)`
+    with open(src_filepath, "r") as reader, open(dst_filepath, "w") as writer:
+        while True:
+            src_line = reader.readline()
+            if len(src_line) == 0:
+                break
+            skip_lines = [
+                "run(",
+                "@run",
+                "@constructAndPrintInModule",
+                "run_apply_patterns(",
+                "@run_apply_patterns",
+                "@test_in_context",
+                "@construct_and_print_in_module",
+            ]
+            if any(src_line.startswith(line) for line in skip_lines):
+                continue
+            writer.write(src_line)
+
+
+# Helper run functions
+def run(f):
+    f()
+
+
+def run_with_context_and_location(f):
+    print("\nTEST:", f.__name__)
+    with Context(), Location.unknown():
+        f()
+    return f
+
+
+def run_with_insertion_point(f):
+    print("\nTEST:", f.__name__)
+    with Context() as ctx, Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            f(ctx)
+        print(module)
+
+
+def run_with_insertion_point_v2(f):
+    print("\nTEST:", f.__name__)
+    with Context(), Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            f()
+        print(module)
+    return f
+
+
+def run_with_insertion_point_v3(f):
+    with Context(), Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            print("\nTEST:", f.__name__)
+            f(module)
+        print(module)
+    return f
+
+
+def run_with_insertion_point_v4(f):
+    print("\nTEST:", f.__name__)
+    with Context() as ctx, Location.unknown():
+        ctx.allow_unregistered_dialects = True
+        module = Module.create()
+        with InsertionPoint(module.body):
+            f()
+    return f
+
+
+def run_apply_patterns(f):
+    with Context(), Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            sequence = transform.SequenceOp(
+                transform.FailurePropagationMode.Propagate,
+                [],
+                transform.AnyOpType.get(),
+            )
+            with InsertionPoint(sequence.body):
+                apply = transform.ApplyPatternsOp(sequence.bodyTarget)
+                with InsertionPoint(apply.patterns):
+                    f()
+                transform.YieldOp()
+        print("\nTEST:", f.__name__)
+    print(module)
+    return f
+
+
+def run_transform_tensor_ext(f):
+    print("\nTEST:", f.__name__)
+    with Context(), Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            sequence = transform.SequenceOp(
+                transform.FailurePropagationMode.Propagate,
+                [],
+                transform.AnyOpType.get(),
+            )
+            with InsertionPoint(sequence.body):
+                f(sequence.bodyTarget)
+                transform.YieldOp()
+        print(module)
+    return f
+
+
+def run_transform_structured_ext(f):
+    with Context(), Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            print("\nTEST:", f.__name__)
+            f()
+        module.operation.verify()
+        print(module)
+    return f
+
+
+def run_construct_and_print_in_module(f):
+    print("\nTEST:", f.__name__)
+    with Context(), Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            module = f(module)
+        if module is not None:
+            print(module)
+    return f
+
+
+TEST_MODULES = [
+    ("execution_engine", run),
+    ("pass_manager", run),
+    ("dialects/affine", run_with_insertion_point_v2),
+    ("dialects/func", run_with_insertion_point_v2),
+    ("dialects/arith_dialect", run),
+    ("dialects/arith_llvm", run),
+    ("dialects/async_dialect", run),
+    ("dialects/builtin", run),
+    ("dialects/cf", run_with_insertion_point_v4),
+    ("dialects/complex_dialect", run),
+    ("dialects/func", run_with_insertion_point_v2),
+    ("dialects/index_dialect", run_with_insertion_point),
+    ("dialects/llvm", run_with_insertion_point_v2),
+    ("dialects/math_dialect", run),
+    ("dialects/memref", run),
+    ("dialects/ml_program", run_with_insertion_point_v2),
+    ("dialects/nvgpu", run_with_insertion_point_v2),
+    ("dialects/nvvm", run_with_insertion_point_v2),
+    ("dialects/ods_helpers", run),
+    ("dialects/openmp_ops", run_with_insertion_point_v2),
+    ("dialects/pdl_ops", run_with_insertion_point_v2),
+    # ("dialects/python_test", run),  # TODO: Need to pass pybind11 or nanobind argv
+    ("dialects/quant", run),
+    ("dialects/rocdl", run_with_insertion_point_v2),
+    ("dialects/scf", run_with_insertion_point_v2),
+    ("dialects/shape", run),
+    ("dialects/spirv_dialect", run),
+    ("dialects/tensor", run),
+    # ("dialects/tosa", ),  # Nothing to test
+    ("dialects/transform_bufferization_ext", run_with_insertion_point_v2),
+    # ("dialects/transform_extras", ),  # Needs a more complicated execution schema
+    ("dialects/transform_gpu_ext", run_transform_tensor_ext),
+    (
+        "dialects/transform_interpreter",
+        run_with_context_and_location,
+        ["print_", "transform_options", "failed", "include"],
+    ),
+    (
+        "dialects/transform_loop_ext",
+        run_with_insertion_point_v2,
+        ["loopOutline"],
+    ),
+    ("dialects/transform_memref_ext", run_with_insertion_point_v2),
+    ("dialects/transform_nvgpu_ext", run_with_insertion_point_v2),
+    ("dialects/transform_sparse_tensor_ext", run_transform_tensor_ext),
+    ("dialects/transform_structured_ext", run_transform_structured_ext),
+    ("dialects/transform_tensor_ext", run_transform_tensor_ext),
+    (
+        "dialects/transform_vector_ext",
+        run_apply_patterns,
+        ["configurable_patterns"],
+    ),
+    ("dialects/transform", run_with_insertion_point_v3),
+    ("dialects/vector", run_with_context_and_location),
+    ("dialects/gpu/dialect", run_with_context_and_location),
+    ("dialects/gpu/module-to-binary-nvvm", run_with_context_and_location),
+    ("dialects/gpu/module-to-binary-rocdl", run_with_context_and_location),
+    ("dialects/linalg/ops", run),
+    # TO ADD: No proper tests in dialects/linalg/opsdsl/*
+    # ("dialects/linalg/opsdsl/*", ...),
+    ("dialects/sparse_tensor/dialect", run),
+    ("dialects/sparse_tensor/passes", run),
+    ("integration/dialects/pdl", run_construct_and_print_in_module),
+    ("integration/dialects/transform", run_construct_and_print_in_module),
+    ("integration/dialects/linalg/opsrun", run),
+    ("ir/affine_expr", run),
+    ("ir/affine_map", run),
+    ("ir/array_attributes", run),
+    ("ir/attributes", run),
+    ("ir/blocks", run),
+    ("ir/builtin_types", run),
+    ("ir/context_managers", run),
+    ("ir/debug", run),
+    ("ir/diagnostic_handler", run),
+    ("ir/dialects", run),
("ir/exception", run), + ("ir/insertion_point", run), + ("ir/integer_set", run), + ("ir/location", run), + ("ir/module", run), + ("ir/operation", run), + ("ir/symbol_table", run), + ("ir/value", run), +] + +TESTS_TO_SKIP = [ + "test_execution_engine__testNanoTime_multi_threaded", # testNanoTime can't run in multiple threads, even with GIL + "test_execution_engine__testSharedLibLoad_multi_threaded", # testSharedLibLoad can't run in multiple threads, even with GIL + "test_dialects_arith_dialect__testArithValue_multi_threaded", # RuntimeError: Value caster is already registered: .ArithValue'>, even with GIL + "test_ir_dialects__testAppendPrefixSearchPath_multi_threaded", # PyGlobals::setDialectSearchPrefixes is not thread-safe, even with GIL. Strange usage of static PyGlobals vs python exposed _cext.globals + "test_ir_value__testValueCasters_multi_threaded", # RuntimeError: Value caster is already registered: .dont_cast_int, even with GIL + # tests indirectly calling thread-unsafe llvm::raw_ostream + "test_execution_engine__testInvalidModule_multi_threaded", # mlirExecutionEngineCreate calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrAfterAll_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrBeforeAndAfterAll_multi_threaded", # IRPrinterInstrumentation::runBeforePass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrLargeLimitElements_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrTree_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testRunPipeline_multi_threaded", # PrintOpStatsPass::printSummary calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__include_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__transform_options_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__print_self_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) 
+    "test_dialects_transform_interpreter__print_self_multi_threaded",  # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream
+    "test_ir_diagnostic_handler__testDiagnosticCallbackException_multi_threaded",  # mlirEmitError calls thread-unsafe llvm::raw_ostream
+    "test_ir_module__testParseSuccess_multi_threaded",  # mlirOperationDump calls thread-unsafe llvm::raw_ostream
+    # False-positive TSAN detected race in llvm::RuntimeDyldELF::registerEHFrames()
+    # Details: https://github.com/llvm/llvm-project/pull/107103/files#r1905726947
+    "test_execution_engine__testCapsule_multi_threaded",
+    "test_execution_engine__testDumpToObjectFile_multi_threaded",
+]
+
+TESTS_TO_XFAIL = [
+    # execution_engine tests:
+    # - ctypes related data-races: https://github.com/python/cpython/issues/127945
+    "test_execution_engine__testBF16Memref_multi_threaded",
+    "test_execution_engine__testBasicCallback_multi_threaded",
+    "test_execution_engine__testComplexMemrefAdd_multi_threaded",
+    "test_execution_engine__testComplexUnrankedMemrefAdd_multi_threaded",
+    "test_execution_engine__testDynamicMemrefAdd2D_multi_threaded",
+    "test_execution_engine__testF16MemrefAdd_multi_threaded",
+    "test_execution_engine__testF8E5M2Memref_multi_threaded",
+    "test_execution_engine__testInvokeFloatAdd_multi_threaded",
+    "test_execution_engine__testInvokeVoid_multi_threaded",  # a ctypes race
+    "test_execution_engine__testMemrefAdd_multi_threaded",
+    "test_execution_engine__testRankedMemRefCallback_multi_threaded",
+    "test_execution_engine__testRankedMemRefWithOffsetCallback_multi_threaded",
+    "test_execution_engine__testUnrankedMemRefCallback_multi_threaded",
+    "test_execution_engine__testUnrankedMemRefWithOffsetCallback_multi_threaded",
+    # dialects tests
+    "test_dialects_memref__testSubViewOpInferReturnTypeExtensiveSlicing_multi_threaded",  # Related to ctypes data races
+    "test_dialects_transform_interpreter__print_other_multi_threaded",  # Fatal Python error: Aborted or mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) is not thread-safe
+    "test_dialects_gpu_module-to-binary-rocdl__testGPUToASMBin_multi_threaded",  # Due to global llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp::GCNTrackers variable mutation
+    "test_dialects_gpu_module-to-binary-nvvm__testGPUToASMBin_multi_threaded",
+    "test_dialects_gpu_module-to-binary-nvvm__testGPUToLLVMBin_multi_threaded",
+    "test_dialects_gpu_module-to-binary-rocdl__testGPUToLLVMBin_multi_threaded",
+    # integration tests
+    "test_integration_dialects_linalg_opsrun__test_elemwise_builtin_multi_threaded",  # Related to ctypes data races
+    "test_integration_dialects_linalg_opsrun__test_elemwise_generic_multi_threaded",  # Related to ctypes data races
+    "test_integration_dialects_linalg_opsrun__test_fill_builtin_multi_threaded",  # ctypes
+    "test_integration_dialects_linalg_opsrun__test_fill_generic_multi_threaded",  # ctypes
+    "test_integration_dialects_linalg_opsrun__test_fill_rng_builtin_multi_threaded",  # ctypes
+    "test_integration_dialects_linalg_opsrun__test_fill_rng_generic_multi_threaded",  # ctypes
+    "test_integration_dialects_linalg_opsrun__test_max_pooling_builtin_multi_threaded",  # ctypes
+    "test_integration_dialects_linalg_opsrun__test_max_pooling_generic_multi_threaded",  # ctypes
+    "test_integration_dialects_linalg_opsrun__test_min_pooling_builtin_multi_threaded",  # ctypes
+    "test_integration_dialects_linalg_opsrun__test_min_pooling_generic_multi_threaded",  # ctypes
+]
+
+
+def add_existing_tests(test_modules, test_prefix: str = "_original_test"):
+    def decorator(test_cls):
+        this_folder = Path(__file__).parent.absolute()
+        test_cls.output_folder = tempfile.TemporaryDirectory()
+        output_folder = Path(test_cls.output_folder.name)
+
+        for test_mod_info in test_modules:
+            assert isinstance(test_mod_info, tuple) and len(test_mod_info) in (2, 3)
+            if len(test_mod_info) == 2:
+                test_module_name, exec_fn = test_mod_info
+                test_pattern = None
+            else:
+                test_module_name, exec_fn, test_pattern = test_mod_info
+
+            src_filepath = this_folder / f"{test_module_name}.py"
+            dst_filepath = (output_folder / f"{test_module_name}.py").absolute()
+            if not dst_filepath.parent.exists():
+                dst_filepath.parent.mkdir(parents=True)
+            copy_and_update(src_filepath, dst_filepath)
+            test_mod = import_from_path(test_module_name, dst_filepath)
+            for attr_name in dir(test_mod):
+                is_test_fn = test_pattern is None and attr_name.startswith("test")
+                is_test_fn |= test_pattern is not None and any(
+                    [p in attr_name for p in test_pattern]
+                )
+                if is_test_fn:
+                    obj = getattr(test_mod, attr_name)
+                    if callable(obj):
+                        test_name = f"{test_prefix}_{test_module_name.replace('/', '_')}__{attr_name}"
+
+                        def wrapped_test_fn(
+                            self, *args, __test_fn__=obj, __exec_fn__=exec_fn, **kwargs
+                        ):
+                            __exec_fn__(__test_fn__)
+
+                        setattr(test_cls, test_name, wrapped_test_fn)
+        return test_cls
+
+    return decorator
+
+
+@contextmanager
+def _capture_output(fp):
+    # Inspired by the jax test_utils.py capture_stderr method
+    # ``None`` means nothing has been captured yet.
+    captured = None
+
+    def get_output() -> str:
+        if captured is None:
+            raise ValueError("get_output() called while the context is active.")
+        return captured
+
+    with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f:
+        original_fd = os.dup(fp.fileno())
+        os.dup2(f.fileno(), fp.fileno())
+        try:
+            yield get_output
+        finally:
+            # Python also has its own buffers, make sure everything is flushed.
+            fp.flush()
+            os.fsync(fp.fileno())
+            f.seek(0)
+            captured = f.read()
+            os.dup2(original_fd, fp.fileno())
+
+
+capture_stdout = partial(_capture_output, sys.stdout)
+capture_stderr = partial(_capture_output, sys.stderr)
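[Editor's note — not part of the patch] A quick usage sketch for the capture helpers just defined; note that `get_output()` is only valid after the with-block has exited:

```python
# Editorial usage sketch of capture_stderr from the file above.
import sys

with capture_stderr() as get_output:
    print("detected by TSAN?", file=sys.stderr)
captured = get_output()  # raises ValueError if called inside the block
assert "TSAN" in captured
```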
+
+
+def multi_threaded(
+    num_workers: int,
+    num_runs: int = 5,
+    skip_tests: Optional[List[str]] = None,
+    xfail_tests: Optional[List[str]] = None,
+    test_prefix: str = "_original_test",
+    multithreaded_test_postfix: str = "_multi_threaded",
+):
+    """Decorator that runs a test in a multi-threaded environment."""
+
+    def decorator(test_cls):
+        for name, test_fn in test_cls.__dict__.copy().items():
+            if not (name.startswith(test_prefix) and callable(test_fn)):
+                continue
+
+            name = f"test{name[len(test_prefix):]}"
+            if skip_tests is not None:
+                if any(
+                    test_name.replace(multithreaded_test_postfix, "") in name
+                    for test_name in skip_tests
+                ):
+                    continue
+
+            def multi_threaded_test_fn(self, *args, __test_fn__=test_fn, **kwargs):
+                with capture_stdout(), capture_stderr() as get_output:
+                    barrier = threading.Barrier(num_workers)
+
+                    def closure():
+                        barrier.wait()
+                        for _ in range(num_runs):
+                            __test_fn__(self, *args, **kwargs)
+
+                    with concurrent.futures.ThreadPoolExecutor(
+                        max_workers=num_workers
+                    ) as executor:
+                        futures = []
+                        for _ in range(num_workers):
+                            futures.append(executor.submit(closure))
+                        # We should call future.result() to re-raise an exception if the
+                        # test has failed
+                        assert len(list(f.result() for f in futures)) == num_workers
+
+                    gc.collect()
+                    assert Context._get_live_count() == 0
+
+                captured = get_output()
+                if len(captured) > 0 and "ThreadSanitizer" in captured:
+                    raise RuntimeError(
+                        f"ThreadSanitizer reported warnings:\n{captured}"
+                    )
+
+            test_new_name = f"{name}{multithreaded_test_postfix}"
+            if xfail_tests is not None and test_new_name in xfail_tests:
+                multi_threaded_test_fn = unittest.expectedFailure(
+                    multi_threaded_test_fn
+                )
+
+            setattr(test_cls, test_new_name, multi_threaded_test_fn)
+
+        return test_cls
+
+    return decorator
+
+
+@multi_threaded(
+    num_workers=10,
+    num_runs=20,
+    skip_tests=TESTS_TO_SKIP,
+    xfail_tests=TESTS_TO_XFAIL,
+)
+@add_existing_tests(test_modules=TEST_MODULES, test_prefix="_original_test")
+class TestAllMultiThreaded(unittest.TestCase):
+    @classmethod
+    def tearDownClass(cls):
+        if hasattr(cls, "output_folder"):
+            cls.output_folder.cleanup()
+
+    def _original_test_create_context(self):
+        with Context() as ctx:
+            print(ctx._get_live_count())
+            print(ctx._get_live_module_count())
+            print(ctx._get_live_operation_count())
+            print(ctx._get_live_operation_objects())
+            print(ctx._get_context_again() is ctx)
+            print(ctx._clear_live_operations())
+
+    def _original_test_create_module_with_consts(self):
+        py_values = [123, 234, 345]
+        with Context() as ctx:
+            module = Module.create(loc=Location.file("foo.txt", 0, 0))
+
+            dtype = IntegerType.get_signless(64)
+            with InsertionPoint(module.body), Location.name("a"):
+                arith.constant(dtype, py_values[0])
+
+            with InsertionPoint(module.body), Location.name("b"):
+                arith.constant(dtype, py_values[1])
+
+            with InsertionPoint(module.body), Location.name("c"):
+                arith.constant(dtype, py_values[2])
+
+
+if __name__ == "__main__":
+    # Do not run the tests on CPython with GIL
+    if hasattr(sys, "_is_gil_enabled") and not sys._is_gil_enabled():
+        unittest.main()
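[Editor's note — not part of the patch] The remaining Polly cleanups below replace `== None`/`!= None` with identity checks. A one-line illustration of why `is None` is the correct form:

```python
# Editorial illustration: equality can be overridden, identity cannot.
class AlwaysEqual:
    def __eq__(self, other):
        return True  # claims equality with everything, including None

obj = AlwaysEqual()
assert obj == None      # misleadingly True (and flagged by linters as E711)
assert obj is not None  # the identity check gives the intended answer
```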
"@POLLY_TEST_EXTRA_PATHS@".split(";") ## Check the current platform with regex import re EAT_ERR_ON_X86 = ' ' -if (re.match(r'^x86_64*', '@LLVM_TARGET_TRIPLE@') == None) : +if (re.match(r'^x86_64*', '@LLVM_TARGET_TRIPLE@') is None) : EAT_ERR_ON_X86 = '|| echo \"error is eaten\"' for arch in config.targets_to_build.split(): diff --git a/polly/test/update_check.py b/polly/test/update_check.py index 88d95c247c063..a973c72ff4e78 100644 --- a/polly/test/update_check.py +++ b/polly/test/update_check.py @@ -222,7 +222,12 @@ def classyfier2(lines): line = i.__next__() -replrepl = {"{{": "{{[{][{]}}", "}}": "{{[}][}]}}", "[[": "{{\[\[}}", "]]": "{{\]\]}}"} +replrepl = { + "{{": "{{[{][{]}}", + "}}": "{{[}][}]}}", + "[[": r"{{\[\[}}", + "]]": r"{{\]\]}}", +} replre = re.compile("|".join(re.escape(k) for k in replrepl.keys())) @@ -452,7 +457,7 @@ def main(): checkre = re.compile( r"^\s*\;\s*(" + "|".join([re.escape(s) for s in checkprefixes]) - + ")(\-NEXT|\-DAG|\-NOT|\-LABEL|\-SAME)?\s*\:" + + r")(\-NEXT|\-DAG|\-NOT|\-LABEL|\-SAME)?\s*\:" ) firstcheckline = None firstnoncommentline = None diff --git a/polly/utils/pyscop/isl.py b/polly/utils/pyscop/isl.py index 5eaf7798e20b9..c06b7bca28042 100644 --- a/polly/utils/pyscop/isl.py +++ b/polly/utils/pyscop/isl.py @@ -24,7 +24,7 @@ def from_ptr(ptr): @staticmethod def getDefaultInstance(): - if Context.defaultInstance == None: + if Context.defaultInstance is None: Context.defaultInstance = Context() return Context.defaultInstance @@ -33,12 +33,12 @@ def getDefaultInstance(): class IslObject: def __init__(self, string="", ctx=None, ptr=None): self.initialize_isl_methods() - if ptr != None: + if ptr is not None: self.ptr = ptr self.ctx = self.get_isl_method("get_ctx")(self) return - if ctx == None: + if ctx is None: ctx = Context.getDefaultInstance() self.ctx = ctx @@ -236,7 +236,7 @@ class Printer: FORMAT_EXT_POLYLIB = 6 def __init__(self, ctx=None): - if ctx == None: + if ctx is None: ctx = Context.getDefaultInstance() self.ctx = ctx