diff --git a/bench/boost/optimized/dump_ssse3.ll b/bench/boost/optimized/dump_ssse3.ll
index 56f3bb8c746..551f5400261 100644
--- a/bench/boost/optimized/dump_ssse3.ll
+++ b/bench/boost/optimized/dump_ssse3.ll
@@ -1263,9 +1263,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux32dump_data_char_ssse3_slow_ps
   %30 = or disjoint <16 x i8> %27, splat (i8 48)
   %31 = bitcast <2 x i64> %.084.i to <16 x i8>
   %32 = select <16 x i1> %26, <16 x i8> %31, <16 x i8> zeroinitializer
-  %33 = add <16 x i8> %32, %29
+  %33 = add nuw nsw <16 x i8> %32, %29
   %34 = select <16 x i1> %28, <16 x i8> %31, <16 x i8> zeroinitializer
-  %35 = add <16 x i8> %34, %30
+  %35 = add nuw nsw <16 x i8> %34, %30
   %36 = shufflevector <16 x i8> %33, <16 x i8> %35, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %37 = shufflevector <16 x i8> %33, <16 x i8> %35, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %38 = shufflevector <16 x i8> %36, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -1345,9 +1345,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux32dump_data_char_ssse3_slow_ps
   %74 = or disjoint <16 x i8> %69, splat (i8 48)
   %75 = or disjoint <16 x i8> %72, splat (i8 48)
   %76 = select <16 x i1> %70, <16 x i8> %55, <16 x i8> zeroinitializer
-  %77 = add <16 x i8> %76, %74
+  %77 = add nuw nsw <16 x i8> %76, %74
   %78 = select <16 x i1> %73, <16 x i8> %55, <16 x i8> zeroinitializer
-  %79 = add <16 x i8> %78, %75
+  %79 = add nuw nsw <16 x i8> %78, %75
   %80 = shufflevector <16 x i8> %77, <16 x i8> %79, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %81 = shufflevector <16 x i8> %77, <16 x i8> %79, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %82 = shufflevector <16 x i8> %80, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -1384,9 +1384,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux32dump_data_char_ssse3_slow_ps
   %104 = or disjoint <16 x i8> %99, splat (i8 48)
   %105 = or disjoint <16 x i8> %102, splat (i8 48)
   %106 = select <16 x i1> %100, <16 x i8> %58, <16 x i8> zeroinitializer
-  %107 = add <16 x i8> %106, %104
+  %107 = add nuw nsw <16 x i8> %106, %104
   %108 = select <16 x i1> %103, <16 x i8> %58, <16 x i8> zeroinitializer
-  %109 = add <16 x i8> %108, %105
+  %109 = add nuw nsw <16 x i8> %108, %105
   %110 = shufflevector <16 x i8> %107, <16 x i8> %109, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %111 = shufflevector <16 x i8> %107, <16 x i8> %109, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %112 = shufflevector <16 x i8> %110, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -1509,9 +1509,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux33dump_data_wchar_ssse3_slow_p
   %30 = or disjoint <16 x i8> %27, splat (i8 48)
   %31 = bitcast <2 x i64> %.084.i to <16 x i8>
   %32 = select <16 x i1> %26, <16 x i8> %31, <16 x i8> zeroinitializer
-  %33 = add <16 x i8> %32, %29
+  %33 = add nuw nsw <16 x i8> %32, %29
   %34 = select <16 x i1> %28, <16 x i8> %31, <16 x i8> zeroinitializer
-  %35 = add <16 x i8> %34, %30
+  %35 = add nuw nsw <16 x i8> %34, %30
   %36 = shufflevector <16 x i8> %33, <16 x i8> %35, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %37 = shufflevector <16 x i8> %33, <16 x i8> %35, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %38 = shufflevector <16 x i8> %36, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -1634,9 +1634,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux33dump_data_wchar_ssse3_slow_p
   %108 = or disjoint <16 x i8> %103, splat (i8 48)
   %109 = or disjoint <16 x i8> %106, splat (i8 48)
   %110 = select <16 x i1> %104, <16 x i8> %88, <16 x i8> zeroinitializer
-  %111 = add <16 x i8> %110, %108
+  %111 = add nuw nsw <16 x i8> %110, %108
   %112 = select <16 x i1> %107, <16 x i8> %88, <16 x i8> zeroinitializer
-  %113 = add <16 x i8> %112, %109
+  %113 = add nuw nsw <16 x i8> %112, %109
   %114 = shufflevector <16 x i8> %111, <16 x i8> %113, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %115 = shufflevector <16 x i8> %111, <16 x i8> %113, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %116 = shufflevector <16 x i8> %114, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -1715,9 +1715,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux33dump_data_wchar_ssse3_slow_p
   %171 = or disjoint <16 x i8> %166, splat (i8 48)
   %172 = or disjoint <16 x i8> %169, splat (i8 48)
   %173 = select <16 x i1> %167, <16 x i8> %91, <16 x i8> zeroinitializer
-  %174 = add <16 x i8> %173, %171
+  %174 = add nuw nsw <16 x i8> %173, %171
   %175 = select <16 x i1> %170, <16 x i8> %91, <16 x i8> zeroinitializer
-  %176 = add <16 x i8> %175, %172
+  %176 = add nuw nsw <16 x i8> %175, %172
   %177 = shufflevector <16 x i8> %174, <16 x i8> %176, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %178 = shufflevector <16 x i8> %174, <16 x i8> %176, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %179 = shufflevector <16 x i8> %177, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -1885,9 +1885,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux34dump_data_char16_ssse3_slow_
   %30 = or disjoint <16 x i8> %27, splat (i8 48)
   %31 = bitcast <2 x i64> %.084.i to <16 x i8>
   %32 = select <16 x i1> %26, <16 x i8> %31, <16 x i8> zeroinitializer
-  %33 = add <16 x i8> %32, %29
+  %33 = add nuw nsw <16 x i8> %32, %29
   %34 = select <16 x i1> %28, <16 x i8> %31, <16 x i8> zeroinitializer
-  %35 = add <16 x i8> %34, %30
+  %35 = add nuw nsw <16 x i8> %34, %30
   %36 = shufflevector <16 x i8> %33, <16 x i8> %35, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %37 = shufflevector <16 x i8> %33, <16 x i8> %35, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %38 = shufflevector <16 x i8> %36, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -1980,9 +1980,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux34dump_data_char16_ssse3_slow_
   %84 = or disjoint <16 x i8> %79, splat (i8 48)
   %85 = or disjoint <16 x i8> %82, splat (i8 48)
   %86 = select <16 x i1> %80, <16 x i8> %64, <16 x i8> zeroinitializer
-  %87 = add <16 x i8> %86, %84
+  %87 = add nuw nsw <16 x i8> %86, %84
   %88 = select <16 x i1> %83, <16 x i8> %64, <16 x i8> zeroinitializer
-  %89 = add <16 x i8> %88, %85
+  %89 = add nuw nsw <16 x i8> %88, %85
   %90 = shufflevector <16 x i8> %87, <16 x i8> %89, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %91 = shufflevector <16 x i8> %87, <16 x i8> %89, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %92 = shufflevector <16 x i8> %90, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -2031,9 +2031,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux34dump_data_char16_ssse3_slow_
   %123 = or disjoint <16 x i8> %118, splat (i8 48)
   %124 = or disjoint <16 x i8> %121, splat (i8 48)
   %125 = select <16 x i1> %119, <16 x i8> %67, <16 x i8> zeroinitializer
-  %126 = add <16 x i8> %125, %123
+  %126 = add nuw nsw <16 x i8> %125, %123
   %127 = select <16 x i1> %122, <16 x i8> %67, <16 x i8> zeroinitializer
-  %128 = add <16 x i8> %127, %124
+  %128 = add nuw nsw <16 x i8> %127, %124
   %129 = shufflevector <16 x i8> %126, <16 x i8> %128, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %130 = shufflevector <16 x i8> %126, <16 x i8> %128, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %131 = shufflevector <16 x i8> %129, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -2171,9 +2171,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux34dump_data_char32_ssse3_slow_
   %30 = or disjoint <16 x i8> %27, splat (i8 48)
   %31 = bitcast <2 x i64> %.084.i to <16 x i8>
   %32 = select <16 x i1> %26, <16 x i8> %31, <16 x i8> zeroinitializer
-  %33 = add <16 x i8> %32, %29
+  %33 = add nuw nsw <16 x i8> %32, %29
   %34 = select <16 x i1> %28, <16 x i8> %31, <16 x i8> zeroinitializer
-  %35 = add <16 x i8> %34, %30
+  %35 = add nuw nsw <16 x i8> %34, %30
   %36 = shufflevector <16 x i8> %33, <16 x i8> %35, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %37 = shufflevector <16 x i8> %33, <16 x i8> %35, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %38 = shufflevector <16 x i8> %36, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -2296,9 +2296,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux34dump_data_char32_ssse3_slow_
   %108 = or disjoint <16 x i8> %103, splat (i8 48)
   %109 = or disjoint <16 x i8> %106, splat (i8 48)
   %110 = select <16 x i1> %104, <16 x i8> %88, <16 x i8> zeroinitializer
-  %111 = add <16 x i8> %110, %108
+  %111 = add nuw nsw <16 x i8> %110, %108
   %112 = select <16 x i1> %107, <16 x i8> %88, <16 x i8> zeroinitializer
-  %113 = add <16 x i8> %112, %109
+  %113 = add nuw nsw <16 x i8> %112, %109
   %114 = shufflevector <16 x i8> %111, <16 x i8> %113, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %115 = shufflevector <16 x i8> %111, <16 x i8> %113, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %116 = shufflevector <16 x i8> %114, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
@@ -2377,9 +2377,9 @@ define hidden void @_ZN5boost3log11v2_mt_posix3aux34dump_data_char32_ssse3_slow_
   %171 = or disjoint <16 x i8> %166, splat (i8 48)
   %172 = or disjoint <16 x i8> %169, splat (i8 48)
   %173 = select <16 x i1> %167, <16 x i8> %91, <16 x i8> zeroinitializer
-  %174 = add <16 x i8> %173, %171
+  %174 = add nuw nsw <16 x i8> %173, %171
   %175 = select <16 x i1> %170, <16 x i8> %91, <16 x i8> zeroinitializer
-  %176 = add <16 x i8> %175, %172
+  %176 = add nuw nsw <16 x i8> %175, %172
   %177 = shufflevector <16 x i8> %174, <16 x i8> %176, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %178 = shufflevector <16 x i8> %174, <16 x i8> %176, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %179 = shufflevector <16 x i8> %177, <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 0, i32 1, i32 16, i32 2, i32 3, i32 16, i32 4, i32 5, i32 16, i32 6, i32 7, i32 16, i32 8, i32 9, i32 16>
diff --git a/bench/libquic/optimized/poly1305_vec.ll b/bench/libquic/optimized/poly1305_vec.ll
index 956b4f03116..cd6a7d63a25 100644
--- a/bench/libquic/optimized/poly1305_vec.ll
+++ b/bench/libquic/optimized/poly1305_vec.ll
@@ -1228,7 +1228,7 @@ poly1305_combine.exit:                            ; preds = %12, %24
   %309 = add <2 x i64> %299, %308
   %310 = bitcast <2 x i64> %301 to <4 x i32>
   %311 = extractelement <4 x i32> %310, i64 0
-  %312 = sext i32 %311 to i64
+  %312 = zext nneg i32 %311 to i64
   %313 = lshr i64 %312, 26
   %314 = and i64 %312, 67108863
   %315 = bitcast <2 x i64> %303 to <4 x i32>
@@ -1239,13 +1239,13 @@ poly1305_combine.exit:                            ; preds = %12, %24
   %320 = and i64 %318, 67108863
   %321 = bitcast <2 x i64> %305 to <4 x i32>
   %322 = extractelement <4 x i32> %321, i64 0
-  %323 = sext i32 %322 to i64
-  %324 = add nsw i64 %319, %323
+  %323 = zext nneg i32 %322 to i64
+  %324 = add nuw nsw i64 %319, %323
   %325 = lshr i64 %324, 26
   %326 = bitcast <2 x i64> %307 to <4 x i32>
   %327 = extractelement <4 x i32> %326, i64 0
-  %328 = sext i32 %327 to i64
-  %329 = add nsw i64 %325, %328
+  %328 = zext nneg i32 %327 to i64
+  %329 = add nuw nsw i64 %325, %328
   %330 = lshr i64 %329, 26
   %331 = and i64 %329, 67108863
   %332 = bitcast <2 x i64> %309 to <4 x i32>
@@ -1263,7 +1263,7 @@ poly1305_combine.exit:                            ; preds = %12, %24
   %343 = or disjoint i64 %.masked.i, %340
   store i64 %343, ptr %.phi.trans.insert, align 32, !tbaa !10
   %344 = lshr i64 %341, 18
-  %345 = shl nsw i64 %324, 8
+  %345 = shl nuw nsw i64 %324, 8
   %346 = and i64 %345, 17179868928
   %347 = shl nuw nsw i64 %331, 34
   %.masked366.i = and i64 %347, 17575006175232
diff --git a/bench/libwebp/optimized/lossless_enc_sse2.ll b/bench/libwebp/optimized/lossless_enc_sse2.ll
index 79268b5caf8..35fa11d32a6 100644
--- a/bench/libwebp/optimized/lossless_enc_sse2.ll
+++ b/bench/libwebp/optimized/lossless_enc_sse2.ll
@@ -140,7 +140,7 @@ define internal void @TransformColor_SSE2(ptr noalias noundef %0, ptr noalias no
   %38 = shl <8 x i16> %34, splat (i16 8)
   %39 = tail call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %38, <8 x i16> %29)
   %40 = bitcast <8 x i16> %39 to <4 x i32>
-  %41 = lshr <4 x i32> %40, splat (i32 16)
+  %41 = lshr exact <4 x i32> %40, splat (i32 16)
   %42 = bitcast <4 x i32> %41 to <16 x i8>
   %43 = bitcast <8 x i16> %37 to <16 x i8>
   %44 = add <16 x i8> %42, %43
@@ -233,9 +233,9 @@ define internal void @CollectColorBlueTransforms_SSE2(ptr noalias noundef %0, i3
   %44 = bitcast <2 x i64> %31 to <16 x i8>
   %45 = bitcast <8 x i16> %41 to <16 x i8>
   %46 = bitcast <8 x i16> %36 to <4 x i32>
-  %47 = lshr <4 x i32> %46, splat (i32 16)
+  %47 = lshr exact <4 x i32> %46, splat (i32 16)
   %48 = bitcast <8 x i16> %37 to <4 x i32>
-  %49 = lshr <4 x i32> %48, splat (i32 16)
+  %49 = lshr exact <4 x i32> %48, splat (i32 16)
   %50 = bitcast <4 x i32> %47 to <16 x i8>
   %51 = add <16 x i8> %50, %43
   %52 = sub <16 x i8> %42, %51
@@ -968,7 +968,7 @@ define internal void @BundleColorMap_SSE2(ptr noalias noundef %0, i32 noundef %1
   %37 = lshr <4 x i32> %36, splat (i32 12)
   %38 = bitcast <4 x i32> %37 to <2 x i64>
   %39 = or <2 x i64> %35, splat (i64 -72057589759737856)
-  %40 = or <2 x i64> %39, %38
+  %40 = or disjoint <2 x i64> %39, %38
   store <2 x i64> %40, ptr %.3103, align 1, !tbaa !7
   %41 = getelementptr inbounds nuw i8, ptr %.3103, i64 16
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
diff --git a/bench/libwebp/optimized/lossless_sse2.ll b/bench/libwebp/optimized/lossless_sse2.ll
index 4676b0941c8..48e6de0c24c 100644
--- a/bench/libwebp/optimized/lossless_sse2.ll
+++ b/bench/libwebp/optimized/lossless_sse2.ll
@@ -1332,7 +1332,7 @@ define internal void @TransformColorInverse_SSE2(ptr noundef %0, ptr noundef %1,
   %43 = shl <8 x i16> %42, splat (i16 8)
   %44 = tail call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %43, <8 x i16> %30)
   %45 = bitcast <8 x i16> %44 to <4 x i32>
-  %46 = lshr <4 x i32> %45, splat (i32 8)
+  %46 = lshr exact <4 x i32> %45, splat (i32 8)
   %47 = bitcast <4 x i32> %46 to <16 x i8>
   %48 = bitcast <8 x i16> %43 to <16 x i8>
   %49 = add <16 x i8> %48, %47
diff --git a/bench/llama.cpp/optimized/ggml-cpu-quants.ll b/bench/llama.cpp/optimized/ggml-cpu-quants.ll
index 4b77ad85c61..e5d41f0e728 100644
--- a/bench/llama.cpp/optimized/ggml-cpu-quants.ll
+++ b/bench/llama.cpp/optimized/ggml-cpu-quants.ll
@@ -1703,7 +1703,7 @@ define void @ggml_vec_dot_q4_K_q8_K(i32 noundef %0, ptr noalias noundef writeonl
   %50 = insertelement <4 x i32> %49, i32 %40, i64 2
   %51 = insertelement <4 x i32> %50, i32 %39, i64 3
   %52 = bitcast <4 x i32> %51 to <16 x i8>
-  %53 = zext <16 x i8> %52 to <16 x i16>
+  %53 = zext nneg <16 x i8> %52 to <16 x i16>
   %54 = getelementptr inbounds nuw i8, ptr %22, i64 260
   %55 = load <4 x i64>, ptr %54, align 1, !tbaa !4
   %56 = bitcast <16 x i16> %53 to <8 x i32>
@@ -1846,7 +1846,7 @@ define void @ggml_vec_dot_q5_K_q8_K(i32 noundef %0, ptr noalias noundef writeonl
   %47 = insertelement <4 x i32> %46, i32 %39, i64 2
   %48 = insertelement <4 x i32> %47, i32 %38, i64 3
   %49 = bitcast <4 x i32> %48 to <16 x i8>
-  %50 = zext <16 x i8> %49 to <16 x i16>
+  %50 = zext nneg <16 x i8> %49 to <16 x i16>
   %51 = getelementptr inbounds nuw i8, ptr %21, i64 260
   %52 = load <4 x i64>, ptr %51, align 1, !tbaa !4
   %53 = bitcast <16 x i16> %50 to <8 x i32>
diff --git a/bench/ncnn/optimized/imreadwrite.ll b/bench/ncnn/optimized/imreadwrite.ll
index 7851ea6b3ad..3b136af7c15 100644
--- a/bench/ncnn/optimized/imreadwrite.ll
+++ b/bench/ncnn/optimized/imreadwrite.ll
@@ -11166,274 +11166,270 @@ define internal void @_ZL15stbi__idct_simdPhiPs(ptr noundef writeonly captures(n
   %28 = bitcast <8 x i16> %27 to <4 x i32>
   %29 = ashr exact <4 x i32> %28, splat (i32 4)
   %30 = shufflevector <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %25, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %31 = bitcast <8 x i16> %30 to <4 x i32>
-  %32 = ashr exact <4 x i32> %31, splat (i32 4)
-  %33 = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i16> %26, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %34 = bitcast <8 x i16> %33 to <4 x i32>
-  %35 = ashr exact <4 x i32> %34, splat (i32 4)
-  %36 = shufflevector <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %26, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %37 = bitcast <8 x i16> %36 to <4 x i32>
-  %38 = ashr exact <4 x i32> %37, splat (i32 4)
-  %39 = sub <4 x i32> %29, %23
-  %40 = sub <4 x i32> %32, %24
-  %41 = sub <4 x i32> %35, %21
-  %42 = sub <4 x i32> %38, %22
-  %43 = shufflevector <8 x i16> %18, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %44 = shufflevector <8 x i16> %18, <8 x i16> %10, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %45 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %43, <8 x i16> <i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034>)
-  %46 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %44, <8 x i16> <i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034>)
-  %47 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %43, <8 x i16> <i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552>)
-  %48 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %44, <8 x i16> <i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552>)
-  %49 = shufflevector <8 x i16> %14, <8 x i16> %6, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %50 = shufflevector <8 x i16> %14, <8 x i16> %6, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %51 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %49, <8 x i16> <i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597>)
-  %52 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %50, <8 x i16> <i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597>)
-  %53 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %49, <8 x i16> <i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552>)
-  %54 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %50, <8 x i16> <i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552>)
-  %55 = add <8 x i16> %18, %6
-  %56 = add <8 x i16> %14, %10
-  %57 = shufflevector <8 x i16> %55, <8 x i16> %56, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %58 = shufflevector <8 x i16> %55, <8 x i16> %56, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %59 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %57, <8 x i16> <i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816>)
-  %60 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %58, <8 x i16> <i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816>)
-  %61 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %57, <8 x i16> <i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681>)
-  %62 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %58, <8 x i16> <i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681>)
-  %63 = add <4 x i32> %59, %45
-  %64 = add <4 x i32> %60, %46
-  %65 = add <4 x i32> %61, %51
-  %66 = add <4 x i32> %62, %52
-  %67 = add <4 x i32> %61, %47
-  %68 = add <4 x i32> %62, %48
-  %69 = add <4 x i32> %59, %53
-  %70 = add <4 x i32> %60, %54
-  %71 = or disjoint <4 x i32> %29, splat (i32 512)
-  %72 = add <4 x i32> %71, %23
-  %73 = or disjoint <4 x i32> %32, splat (i32 512)
-  %74 = add <4 x i32> %73, %24
-  %75 = add <4 x i32> %69, %72
-  %76 = add <4 x i32> %70, %74
-  %77 = sub <4 x i32> %72, %69
-  %78 = sub <4 x i32> %74, %70
-  %79 = ashr <4 x i32> %75, splat (i32 10)
-  %80 = ashr <4 x i32> %76, splat (i32 10)
-  %81 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %79, <4 x i32> %80)
-  %82 = ashr <4 x i32> %77, splat (i32 10)
-  %83 = ashr <4 x i32> %78, splat (i32 10)
-  %84 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %82, <4 x i32> %83)
-  %85 = add <4 x i32> %21, splat (i32 512)
-  %86 = add <4 x i32> %85, %35
-  %87 = or disjoint <4 x i32> %38, splat (i32 512)
-  %88 = add <4 x i32> %87, %22
-  %89 = add <4 x i32> %67, %86
-  %90 = add <4 x i32> %68, %88
-  %91 = sub <4 x i32> %86, %67
-  %92 = sub <4 x i32> %88, %68
-  %93 = ashr <4 x i32> %89, splat (i32 10)
-  %94 = ashr <4 x i32> %90, splat (i32 10)
-  %95 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %93, <4 x i32> %94)
-  %96 = ashr <4 x i32> %91, splat (i32 10)
-  %97 = ashr <4 x i32> %92, splat (i32 10)
-  %98 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %96, <4 x i32> %97)
-  %99 = add <4 x i32> %41, splat (i32 512)
-  %100 = add <4 x i32> %42, splat (i32 512)
-  %101 = add <4 x i32> %65, %99
-  %102 = add <4 x i32> %66, %100
-  %103 = sub <4 x i32> %99, %65
-  %104 = sub <4 x i32> %100, %66
-  %105 = ashr <4 x i32> %101, splat (i32 10)
-  %106 = ashr <4 x i32> %102, splat (i32 10)
-  %107 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %105, <4 x i32> %106)
-  %108 = ashr <4 x i32> %103, splat (i32 10)
-  %109 = ashr <4 x i32> %104, splat (i32 10)
-  %110 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %108, <4 x i32> %109)
-  %111 = add <4 x i32> %39, splat (i32 512)
-  %112 = add <4 x i32> %40, splat (i32 512)
-  %113 = add <4 x i32> %63, %111
-  %114 = add <4 x i32> %64, %112
-  %115 = sub <4 x i32> %111, %63
-  %116 = sub <4 x i32> %112, %64
-  %117 = ashr <4 x i32> %113, splat (i32 10)
-  %118 = ashr <4 x i32> %114, splat (i32 10)
-  %119 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %117, <4 x i32> %118)
-  %120 = ashr <4 x i32> %115, splat (i32 10)
-  %121 = ashr <4 x i32> %116, splat (i32 10)
-  %122 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %120, <4 x i32> %121)
-  %123 = shufflevector <8 x i16> %81, <8 x i16> %122, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %124 = shufflevector <8 x i16> %81, <8 x i16> %122, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %125 = shufflevector <8 x i16> %95, <8 x i16> %110, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %126 = shufflevector <8 x i16> %95, <8 x i16> %110, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %127 = shufflevector <8 x i16> %107, <8 x i16> %98, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %128 = shufflevector <8 x i16> %107, <8 x i16> %98, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %129 = shufflevector <8 x i16> %119, <8 x i16> %84, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %130 = shufflevector <8 x i16> %119, <8 x i16> %84, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %31 = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i16> %26, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %32 = bitcast <8 x i16> %31 to <4 x i32>
+  %33 = ashr exact <4 x i32> %32, splat (i32 4)
+  %34 = shufflevector <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %26, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %35 = bitcast <8 x i16> %30 to <4 x i32>
+  %36 = sub <4 x i32> %29, %23
+  %37 = sub <4 x i32> %35, %24
+  %38 = bitcast <8 x i16> %34 to <4 x i32>
+  %39 = sub <4 x i32> %33, %21
+  %40 = sub <4 x i32> %38, %22
+  %41 = shufflevector <8 x i16> %18, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %42 = shufflevector <8 x i16> %18, <8 x i16> %10, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %43 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %41, <8 x i16> <i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034>)
+  %44 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %42, <8 x i16> <i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034>)
+  %45 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %41, <8 x i16> <i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552>)
+  %46 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %42, <8 x i16> <i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552>)
+  %47 = shufflevector <8 x i16> %14, <8 x i16> %6, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %48 = shufflevector <8 x i16> %14, <8 x i16> %6, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %49 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %47, <8 x i16> <i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597>)
+  %50 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %48, <8 x i16> <i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597>)
+  %51 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %47, <8 x i16> <i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552>)
+  %52 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %48, <8 x i16> <i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552>)
+  %53 = add <8 x i16> %18, %6
+  %54 = add <8 x i16> %14, %10
+  %55 = shufflevector <8 x i16> %53, <8 x i16> %54, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %56 = shufflevector <8 x i16> %53, <8 x i16> %54, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %57 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %55, <8 x i16> <i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816>)
+  %58 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %56, <8 x i16> <i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816>)
+  %59 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %55, <8 x i16> <i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681>)
+  %60 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %56, <8 x i16> <i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681>)
+  %61 = add <4 x i32> %57, %43
+  %62 = add <4 x i32> %58, %44
+  %63 = add <4 x i32> %59, %49
+  %64 = add <4 x i32> %60, %50
+  %65 = add <4 x i32> %59, %45
+  %66 = add <4 x i32> %60, %46
+  %67 = add <4 x i32> %57, %51
+  %68 = add <4 x i32> %58, %52
+  %69 = or disjoint <4 x i32> %29, splat (i32 512)
+  %70 = add <4 x i32> %69, %23
+  %71 = or disjoint <4 x i32> %35, splat (i32 512)
+  %72 = add <4 x i32> %71, %24
+  %73 = add <4 x i32> %67, %70
+  %74 = add <4 x i32> %68, %72
+  %75 = sub <4 x i32> %70, %67
+  %76 = sub <4 x i32> %72, %68
+  %77 = ashr <4 x i32> %73, splat (i32 10)
+  %78 = ashr <4 x i32> %74, splat (i32 10)
+  %79 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %77, <4 x i32> %78)
+  %80 = ashr <4 x i32> %75, splat (i32 10)
+  %81 = ashr <4 x i32> %76, splat (i32 10)
+  %82 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %80, <4 x i32> %81)
+  %83 = add <4 x i32> %21, splat (i32 512)
+  %84 = add <4 x i32> %83, %33
+  %85 = or disjoint <4 x i32> %38, splat (i32 512)
+  %86 = add <4 x i32> %85, %22
+  %87 = add <4 x i32> %65, %84
+  %88 = add <4 x i32> %66, %86
+  %89 = sub <4 x i32> %84, %65
+  %90 = sub <4 x i32> %86, %66
+  %91 = ashr <4 x i32> %87, splat (i32 10)
+  %92 = ashr <4 x i32> %88, splat (i32 10)
+  %93 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %91, <4 x i32> %92)
+  %94 = ashr <4 x i32> %89, splat (i32 10)
+  %95 = ashr <4 x i32> %90, splat (i32 10)
+  %96 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %94, <4 x i32> %95)
+  %97 = add <4 x i32> %39, splat (i32 512)
+  %98 = add <4 x i32> %40, splat (i32 512)
+  %99 = add <4 x i32> %63, %97
+  %100 = add <4 x i32> %64, %98
+  %101 = sub <4 x i32> %97, %63
+  %102 = sub <4 x i32> %98, %64
+  %103 = ashr <4 x i32> %99, splat (i32 10)
+  %104 = ashr <4 x i32> %100, splat (i32 10)
+  %105 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %103, <4 x i32> %104)
+  %106 = ashr <4 x i32> %101, splat (i32 10)
+  %107 = ashr <4 x i32> %102, splat (i32 10)
+  %108 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %106, <4 x i32> %107)
+  %109 = add <4 x i32> %36, splat (i32 512)
+  %110 = add <4 x i32> %37, splat (i32 512)
+  %111 = add <4 x i32> %61, %109
+  %112 = add <4 x i32> %62, %110
+  %113 = sub <4 x i32> %109, %61
+  %114 = sub <4 x i32> %110, %62
+  %115 = ashr <4 x i32> %111, splat (i32 10)
+  %116 = ashr <4 x i32> %112, splat (i32 10)
+  %117 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %115, <4 x i32> %116)
+  %118 = ashr <4 x i32> %113, splat (i32 10)
+  %119 = ashr <4 x i32> %114, splat (i32 10)
+  %120 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %118, <4 x i32> %119)
+  %121 = shufflevector <8 x i16> %79, <8 x i16> %120, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %122 = shufflevector <8 x i16> %79, <8 x i16> %120, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %123 = shufflevector <8 x i16> %93, <8 x i16> %108, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %124 = shufflevector <8 x i16> %93, <8 x i16> %108, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %125 = shufflevector <8 x i16> %105, <8 x i16> %96, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %126 = shufflevector <8 x i16> %105, <8 x i16> %96, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %127 = shufflevector <8 x i16> %117, <8 x i16> %82, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %128 = shufflevector <8 x i16> %117, <8 x i16> %82, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %129 = shufflevector <8 x i16> %121, <8 x i16> %125, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %130 = shufflevector <8 x i16> %121, <8 x i16> %125, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   %131 = shufflevector <8 x i16> %123, <8 x i16> %127, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   %132 = shufflevector <8 x i16> %123, <8 x i16> %127, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %133 = shufflevector <8 x i16> %125, <8 x i16> %129, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %134 = shufflevector <8 x i16> %125, <8 x i16> %129, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %133 = shufflevector <8 x i16> %122, <8 x i16> %126, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %134 = shufflevector <8 x i16> %122, <8 x i16> %126, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   %135 = shufflevector <8 x i16> %124, <8 x i16> %128, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   %136 = shufflevector <8 x i16> %124, <8 x i16> %128, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %137 = shufflevector <8 x i16> %126, <8 x i16> %130, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %138 = shufflevector <8 x i16> %126, <8 x i16> %130, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %139 = shufflevector <8 x i16> %131, <8 x i16> %133, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %140 = shufflevector <8 x i16> %131, <8 x i16> %133, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %141 = shufflevector <8 x i16> %132, <8 x i16> %134, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %142 = shufflevector <8 x i16> %132, <8 x i16> %134, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %143 = shufflevector <8 x i16> %135, <8 x i16> %137, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %144 = shufflevector <8 x i16> %135, <8 x i16> %137, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %145 = shufflevector <8 x i16> %136, <8 x i16> %138, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %146 = shufflevector <8 x i16> %136, <8 x i16> %138, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %147 = shufflevector <8 x i16> %141, <8 x i16> %145, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %148 = shufflevector <8 x i16> %141, <8 x i16> %145, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %149 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %147, <8 x i16> <i16 2217, i16 -5350, i16 2217, i16 -5350, i16 2217, i16 -5350, i16 2217, i16 -5350>)
-  %150 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %148, <8 x i16> <i16 2217, i16 -5350, i16 2217, i16 -5350, i16 2217, i16 -5350, i16 2217, i16 -5350>)
-  %151 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %147, <8 x i16> <i16 5352, i16 2217, i16 5352, i16 2217, i16 5352, i16 2217, i16 5352, i16 2217>)
-  %152 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %148, <8 x i16> <i16 5352, i16 2217, i16 5352, i16 2217, i16 5352, i16 2217, i16 5352, i16 2217>)
-  %153 = add <8 x i16> %139, %143
-  %154 = sub <8 x i16> %139, %143
-  %155 = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i16> %153, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %156 = bitcast <8 x i16> %155 to <4 x i32>
-  %157 = ashr exact <4 x i32> %156, splat (i32 4)
-  %158 = shufflevector <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %153, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %159 = bitcast <8 x i16> %158 to <4 x i32>
-  %160 = ashr exact <4 x i32> %159, splat (i32 4)
-  %161 = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i16> %154, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %162 = bitcast <8 x i16> %161 to <4 x i32>
-  %163 = ashr exact <4 x i32> %162, splat (i32 4)
-  %164 = shufflevector <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %154, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %165 = bitcast <8 x i16> %164 to <4 x i32>
-  %166 = ashr exact <4 x i32> %165, splat (i32 4)
-  %167 = sub <4 x i32> %157, %151
-  %168 = sub <4 x i32> %160, %152
-  %169 = sub <4 x i32> %163, %149
-  %170 = sub <4 x i32> %166, %150
-  %171 = shufflevector <8 x i16> %146, <8 x i16> %142, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %172 = shufflevector <8 x i16> %146, <8 x i16> %142, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %173 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %171, <8 x i16> <i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034>)
-  %174 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %172, <8 x i16> <i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034>)
-  %175 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %171, <8 x i16> <i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552>)
-  %176 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %172, <8 x i16> <i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552>)
-  %177 = shufflevector <8 x i16> %144, <8 x i16> %140, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %178 = shufflevector <8 x i16> %144, <8 x i16> %140, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %179 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %177, <8 x i16> <i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597>)
-  %180 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %178, <8 x i16> <i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597>)
-  %181 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %177, <8 x i16> <i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552>)
-  %182 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %178, <8 x i16> <i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552>)
-  %183 = add <8 x i16> %140, %146
-  %184 = add <8 x i16> %142, %144
-  %185 = shufflevector <8 x i16> %183, <8 x i16> %184, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %186 = shufflevector <8 x i16> %183, <8 x i16> %184, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  %187 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %185, <8 x i16> <i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816>)
-  %188 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %186, <8 x i16> <i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816>)
-  %189 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %185, <8 x i16> <i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681>)
-  %190 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %186, <8 x i16> <i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681>)
-  %191 = add <4 x i32> %187, %173
-  %192 = add <4 x i32> %188, %174
-  %193 = add <4 x i32> %189, %179
-  %194 = add <4 x i32> %190, %180
-  %195 = add <4 x i32> %189, %175
-  %196 = add <4 x i32> %190, %176
-  %197 = add <4 x i32> %187, %181
-  %198 = add <4 x i32> %188, %182
-  %199 = add <4 x i32> %151, splat (i32 16842752)
-  %200 = add <4 x i32> %199, %157
-  %201 = add <4 x i32> %152, splat (i32 16842752)
-  %202 = add <4 x i32> %201, %160
-  %203 = add <4 x i32> %197, %200
-  %204 = add <4 x i32> %198, %202
-  %205 = sub <4 x i32> %200, %197
-  %206 = sub <4 x i32> %202, %198
-  %207 = ashr <4 x i32> %203, splat (i32 17)
-  %208 = ashr <4 x i32> %204, splat (i32 17)
-  %209 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %207, <4 x i32> %208)
-  %210 = ashr <4 x i32> %205, splat (i32 17)
-  %211 = ashr <4 x i32> %206, splat (i32 17)
-  %212 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %210, <4 x i32> %211)
-  %213 = add <4 x i32> %149, splat (i32 16842752)
-  %214 = add <4 x i32> %213, %163
-  %215 = add <4 x i32> %150, splat (i32 16842752)
-  %216 = add <4 x i32> %215, %166
-  %217 = add <4 x i32> %195, %214
-  %218 = add <4 x i32> %196, %216
-  %219 = sub <4 x i32> %214, %195
-  %220 = sub <4 x i32> %216, %196
-  %221 = ashr <4 x i32> %217, splat (i32 17)
-  %222 = ashr <4 x i32> %218, splat (i32 17)
-  %223 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %221, <4 x i32> %222)
-  %224 = ashr <4 x i32> %219, splat (i32 17)
-  %225 = ashr <4 x i32> %220, splat (i32 17)
-  %226 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %224, <4 x i32> %225)
-  %227 = add <4 x i32> %169, splat (i32 16842752)
-  %228 = add <4 x i32> %170, splat (i32 16842752)
-  %229 = add <4 x i32> %193, %227
-  %230 = add <4 x i32> %194, %228
-  %231 = sub <4 x i32> %227, %193
-  %232 = sub <4 x i32> %228, %194
-  %233 = ashr <4 x i32> %229, splat (i32 17)
-  %234 = ashr <4 x i32> %230, splat (i32 17)
-  %235 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %233, <4 x i32> %234)
-  %236 = ashr <4 x i32> %231, splat (i32 17)
-  %237 = ashr <4 x i32> %232, splat (i32 17)
-  %238 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %236, <4 x i32> %237)
-  %239 = add <4 x i32> %167, splat (i32 16842752)
-  %240 = add <4 x i32> %168, splat (i32 16842752)
-  %241 = add <4 x i32> %191, %239
-  %242 = add <4 x i32> %192, %240
-  %243 = sub <4 x i32> %239, %191
-  %244 = sub <4 x i32> %240, %192
-  %245 = ashr <4 x i32> %241, splat (i32 17)
-  %246 = ashr <4 x i32> %242, splat (i32 17)
-  %247 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %245, <4 x i32> %246)
-  %248 = ashr <4 x i32> %243, splat (i32 17)
-  %249 = ashr <4 x i32> %244, splat (i32 17)
-  %250 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %248, <4 x i32> %249)
-  %251 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %209, <8 x i16> %223)
-  %252 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %235, <8 x i16> %247)
-  %253 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %250, <8 x i16> %238)
-  %254 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %226, <8 x i16> %212)
+  %137 = shufflevector <8 x i16> %129, <8 x i16> %131, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %138 = shufflevector <8 x i16> %129, <8 x i16> %131, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %139 = shufflevector <8 x i16> %130, <8 x i16> %132, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %140 = shufflevector <8 x i16> %130, <8 x i16> %132, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %141 = shufflevector <8 x i16> %133, <8 x i16> %135, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %142 = shufflevector <8 x i16> %133, <8 x i16> %135, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %143 = shufflevector <8 x i16> %134, <8 x i16> %136, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %144 = shufflevector <8 x i16> %134, <8 x i16> %136, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %145 = shufflevector <8 x i16> %139, <8 x i16> %143, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %146 = shufflevector <8 x i16> %139, <8 x i16> %143, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %147 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %145, <8 x i16> <i16 2217, i16 -5350, i16 2217, i16 -5350, i16 2217, i16 -5350, i16 2217, i16 -5350>)
+  %148 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %146, <8 x i16> <i16 2217, i16 -5350, i16 2217, i16 -5350, i16 2217, i16 -5350, i16 2217, i16 -5350>)
+  %149 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %145, <8 x i16> <i16 5352, i16 2217, i16 5352, i16 2217, i16 5352, i16 2217, i16 5352, i16 2217>)
+  %150 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %146, <8 x i16> <i16 5352, i16 2217, i16 5352, i16 2217, i16 5352, i16 2217, i16 5352, i16 2217>)
+  %151 = add <8 x i16> %137, %141
+  %152 = sub <8 x i16> %137, %141
+  %153 = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i16> %151, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %154 = bitcast <8 x i16> %153 to <4 x i32>
+  %155 = ashr exact <4 x i32> %154, splat (i32 4)
+  %156 = shufflevector <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %151, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %157 = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i16> %152, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %158 = bitcast <8 x i16> %157 to <4 x i32>
+  %159 = ashr exact <4 x i32> %158, splat (i32 4)
+  %160 = shufflevector <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %152, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %161 = bitcast <8 x i16> %156 to <4 x i32>
+  %162 = sub <4 x i32> %155, %149
+  %163 = sub <4 x i32> %161, %150
+  %164 = bitcast <8 x i16> %160 to <4 x i32>
+  %165 = sub <4 x i32> %159, %147
+  %166 = sub <4 x i32> %164, %148
+  %167 = shufflevector <8 x i16> %144, <8 x i16> %140, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %168 = shufflevector <8 x i16> %144, <8 x i16> %140, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %169 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %167, <8 x i16> <i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034>)
+  %170 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %168, <8 x i16> <i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034, i16 -6811, i16 -8034>)
+  %171 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %167, <8 x i16> <i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552>)
+  %172 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %168, <8 x i16> <i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552, i16 -8034, i16 4552>)
+  %173 = shufflevector <8 x i16> %142, <8 x i16> %138, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %174 = shufflevector <8 x i16> %142, <8 x i16> %138, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %175 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %173, <8 x i16> <i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597>)
+  %176 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %174, <8 x i16> <i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597, i16 6813, i16 -1597>)
+  %177 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %173, <8 x i16> <i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552>)
+  %178 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %174, <8 x i16> <i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552, i16 -1597, i16 4552>)
+  %179 = add <8 x i16> %138, %144
+  %180 = add <8 x i16> %140, %142
+  %181 = shufflevector <8 x i16> %179, <8 x i16> %180, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %182 = shufflevector <8 x i16> %179, <8 x i16> %180, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %183 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %181, <8 x i16> <i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816>)
+  %184 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %182, <8 x i16> <i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816, i16 1131, i16 4816>)
+  %185 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %181, <8 x i16> <i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681>)
+  %186 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %182, <8 x i16> <i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681, i16 4816, i16 -5681>)
+  %187 = add <4 x i32> %183, %169
+  %188 = add <4 x i32> %184, %170
+  %189 = add <4 x i32> %185, %175
+  %190 = add <4 x i32> %186, %176
+  %191 = add <4 x i32> %185, %171
+  %192 = add <4 x i32> %186, %172
+  %193 = add <4 x i32> %183, %177
+  %194 = add <4 x i32> %184, %178
+  %195 = add <4 x i32> %149, splat (i32 16842752)
+  %196 = add <4 x i32> %195, %155
+  %197 = add <4 x i32> %150, splat (i32 16842752)
+  %198 = add <4 x i32> %197, %161
+  %199 = add <4 x i32> %193, %196
+  %200 = add <4 x i32> %194, %198
+  %201 = sub <4 x i32> %196, %193
+  %202 = sub <4 x i32> %198, %194
+  %203 = ashr <4 x i32> %199, splat (i32 17)
+  %204 = ashr <4 x i32> %200, splat (i32 17)
+  %205 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %203, <4 x i32> %204)
+  %206 = ashr <4 x i32> %201, splat (i32 17)
+  %207 = ashr <4 x i32> %202, splat (i32 17)
+  %208 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %206, <4 x i32> %207)
+  %209 = add <4 x i32> %147, splat (i32 16842752)
+  %210 = add <4 x i32> %209, %159
+  %211 = add <4 x i32> %148, splat (i32 16842752)
+  %212 = add <4 x i32> %211, %164
+  %213 = add <4 x i32> %191, %210
+  %214 = add <4 x i32> %192, %212
+  %215 = sub <4 x i32> %210, %191
+  %216 = sub <4 x i32> %212, %192
+  %217 = ashr <4 x i32> %213, splat (i32 17)
+  %218 = ashr <4 x i32> %214, splat (i32 17)
+  %219 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %217, <4 x i32> %218)
+  %220 = ashr <4 x i32> %215, splat (i32 17)
+  %221 = ashr <4 x i32> %216, splat (i32 17)
+  %222 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %220, <4 x i32> %221)
+  %223 = add <4 x i32> %165, splat (i32 16842752)
+  %224 = add <4 x i32> %166, splat (i32 16842752)
+  %225 = add <4 x i32> %189, %223
+  %226 = add <4 x i32> %190, %224
+  %227 = sub <4 x i32> %223, %189
+  %228 = sub <4 x i32> %224, %190
+  %229 = ashr <4 x i32> %225, splat (i32 17)
+  %230 = ashr <4 x i32> %226, splat (i32 17)
+  %231 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %229, <4 x i32> %230)
+  %232 = ashr <4 x i32> %227, splat (i32 17)
+  %233 = ashr <4 x i32> %228, splat (i32 17)
+  %234 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %232, <4 x i32> %233)
+  %235 = add <4 x i32> %162, splat (i32 16842752)
+  %236 = add <4 x i32> %163, splat (i32 16842752)
+  %237 = add <4 x i32> %187, %235
+  %238 = add <4 x i32> %188, %236
+  %239 = sub <4 x i32> %235, %187
+  %240 = sub <4 x i32> %236, %188
+  %241 = ashr <4 x i32> %237, splat (i32 17)
+  %242 = ashr <4 x i32> %238, splat (i32 17)
+  %243 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %241, <4 x i32> %242)
+  %244 = ashr <4 x i32> %239, splat (i32 17)
+  %245 = ashr <4 x i32> %240, splat (i32 17)
+  %246 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %244, <4 x i32> %245)
+  %247 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %205, <8 x i16> %219)
+  %248 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %231, <8 x i16> %243)
+  %249 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %246, <8 x i16> %234)
+  %250 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %222, <8 x i16> %208)
+  %251 = shufflevector <16 x i8> %247, <16 x i8> %249, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  %252 = shufflevector <16 x i8> %247, <16 x i8> %249, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %253 = shufflevector <16 x i8> %248, <16 x i8> %250, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  %254 = shufflevector <16 x i8> %248, <16 x i8> %250, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %255 = shufflevector <16 x i8> %251, <16 x i8> %253, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %256 = shufflevector <16 x i8> %251, <16 x i8> %253, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %257 = shufflevector <16 x i8> %252, <16 x i8> %254, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %258 = shufflevector <16 x i8> %252, <16 x i8> %254, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %259 = shufflevector <16 x i8> %255, <16 x i8> %257, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-  %260 = shufflevector <16 x i8> %255, <16 x i8> %257, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  %261 = shufflevector <16 x i8> %256, <16 x i8> %258, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-  %262 = shufflevector <16 x i8> %256, <16 x i8> %258, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  %263 = shufflevector <16 x i8> %259, <16 x i8> %261, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  %260 = bitcast <16 x i8> %259 to <2 x i64>
+  %261 = shufflevector <16 x i8> %255, <16 x i8> %257, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %262 = bitcast <16 x i8> %261 to <2 x i64>
+  %263 = shufflevector <16 x i8> %256, <16 x i8> %258, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %264 = bitcast <16 x i8> %263 to <2 x i64>
-  %265 = shufflevector <16 x i8> %259, <16 x i8> %261, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %265 = shufflevector <16 x i8> %256, <16 x i8> %258, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   %266 = bitcast <16 x i8> %265 to <2 x i64>
-  %267 = shufflevector <16 x i8> %260, <16 x i8> %262, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-  %268 = bitcast <16 x i8> %267 to <2 x i64>
-  %269 = shufflevector <16 x i8> %260, <16 x i8> %262, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  %270 = bitcast <16 x i8> %269 to <2 x i64>
-  %271 = extractelement <2 x i64> %264, i64 0
-  store i64 %271, ptr %0, align 1, !tbaa !23
-  %272 = sext i32 %1 to i64
-  %273 = getelementptr inbounds i8, ptr %0, i64 %272
-  %274 = bitcast <16 x i8> %263 to <2 x i64>
-  %275 = extractelement <2 x i64> %274, i64 1
-  store i64 %275, ptr %273, align 1, !tbaa !23
-  %276 = getelementptr inbounds i8, ptr %273, i64 %272
-  %277 = extractelement <2 x i64> %266, i64 0
-  store i64 %277, ptr %276, align 1, !tbaa !23
-  %278 = getelementptr inbounds i8, ptr %276, i64 %272
-  %279 = bitcast <16 x i8> %265 to <2 x i64>
-  %280 = extractelement <2 x i64> %279, i64 1
-  store i64 %280, ptr %278, align 1, !tbaa !23
-  %281 = getelementptr inbounds i8, ptr %278, i64 %272
-  %282 = extractelement <2 x i64> %268, i64 0
-  store i64 %282, ptr %281, align 1, !tbaa !23
-  %283 = getelementptr inbounds i8, ptr %281, i64 %272
-  %284 = bitcast <16 x i8> %267 to <2 x i64>
-  %285 = extractelement <2 x i64> %284, i64 1
-  store i64 %285, ptr %283, align 1, !tbaa !23
-  %286 = getelementptr inbounds i8, ptr %283, i64 %272
-  %287 = extractelement <2 x i64> %270, i64 0
-  store i64 %287, ptr %286, align 1, !tbaa !23
-  %288 = getelementptr inbounds i8, ptr %286, i64 %272
-  %289 = bitcast <16 x i8> %269 to <2 x i64>
-  %290 = extractelement <2 x i64> %289, i64 1
-  store i64 %290, ptr %288, align 1, !tbaa !23
+  %267 = extractelement <2 x i64> %260, i64 0 ; i64 lane 0 of %259
+  store i64 %267, ptr %0, align 1, !tbaa !23
+  %268 = sext i32 %1 to i64 ; byte offset applied between consecutive stores
+  %269 = getelementptr inbounds i8, ptr %0, i64 %268
+  %270 = bitcast <16 x i8> %259 to <2 x i64>
+  %271 = extractelement <2 x i64> %270, i64 1 ; i64 lane 1 of %259
+  store i64 %271, ptr %269, align 1, !tbaa !23
+  %272 = getelementptr inbounds i8, ptr %269, i64 %268 ; previous address + %268 bytes
+  %273 = extractelement <2 x i64> %262, i64 0 ; i64 lane 0 of %261
+  store i64 %273, ptr %272, align 1, !tbaa !23
+  %274 = getelementptr inbounds i8, ptr %272, i64 %268
+  %275 = bitcast <16 x i8> %261 to <2 x i64>
+  %276 = extractelement <2 x i64> %275, i64 1 ; i64 lane 1 of %261
+  store i64 %276, ptr %274, align 1, !tbaa !23
+  %277 = getelementptr inbounds i8, ptr %274, i64 %268
+  %278 = extractelement <2 x i64> %264, i64 0 ; i64 lane 0 of %263
+  store i64 %278, ptr %277, align 1, !tbaa !23
+  %279 = getelementptr inbounds i8, ptr %277, i64 %268
+  %280 = bitcast <16 x i8> %263 to <2 x i64>
+  %281 = extractelement <2 x i64> %280, i64 1 ; i64 lane 1 of %263
+  store i64 %281, ptr %279, align 1, !tbaa !23
+  %282 = getelementptr inbounds i8, ptr %279, i64 %268
+  %283 = extractelement <2 x i64> %266, i64 0 ; i64 lane 0 of %265
+  store i64 %283, ptr %282, align 1, !tbaa !23
+  %284 = getelementptr inbounds i8, ptr %282, i64 %268
+  %285 = bitcast <16 x i8> %265 to <2 x i64>
+  %286 = extractelement <2 x i64> %285, i64 1 ; i64 lane 1 of %265
+  store i64 %286, ptr %284, align 1, !tbaa !23
   ret void
 }
 
diff --git a/bench/node/optimized/simdutf.ll b/bench/node/optimized/simdutf.ll
index 60938632621..f7bb3ce6b91 100644
--- a/bench/node/optimized/simdutf.ll
+++ b/bench/node/optimized/simdutf.ll
@@ -19911,7 +19911,7 @@ if.then52.i:                                      ; preds = %if.end.i
   %and.i372.i = and <4 x i64> %perm.i, splat (i64 17733194119839807)
   %or.i584.i = or disjoint <4 x i64> %and.i379.i, %and.i372.i
   %25 = bitcast <4 x i64> %or.i584.i to <16 x i16>
-  %26 = or <16 x i16> %25, splat (i16 -16256)
+  %26 = or disjoint <16 x i16> %25, splat (i16 -16256)
   %27 = select <16 x i1> %cmp.i539.i, <16 x i16> %15, <16 x i16> %26
   %28 = bitcast <16 x i16> %27 to <32 x i8>
   %and.i = and i32 %18, 1431655765
@@ -20284,7 +20284,7 @@ if.then59.i:                                      ; preds = %if.end41.i
   %and.i400.i = and <4 x i64> %perm.i, splat (i64 17733194119839807)
   %or.i598.i = or disjoint <4 x i64> %and.i407.i, %and.i400.i
   %28 = bitcast <4 x i64> %or.i598.i to <16 x i16>
-  %29 = or <16 x i16> %28, splat (i16 -16256)
+  %29 = or disjoint <16 x i16> %28, splat (i16 -16256)
   %30 = select <16 x i1> %cmp.i560.i, <16 x i16> %17, <16 x i16> %29
   %31 = bitcast <16 x i16> %30 to <32 x i8>
   %and.i = and i32 %20, 1431655765
@@ -26343,7 +26343,7 @@ if.end.i:                                         ; preds = %while.body.i
   %7 = icmp slt <32 x i8> %6, zeroinitializer
   %8 = bitcast <32 x i1> %7 to i32
   %9 = bitcast <4 x i64> %or.i122.i to <16 x i16>
-  %10 = or <16 x i16> %9, splat (i16 -16256)
+  %10 = or disjoint <16 x i16> %9, splat (i16 -16256)
   %11 = select <16 x i1> %cmp.i.i, <16 x i16> %conv.i.i, <16 x i16> %10
   %12 = bitcast <16 x i16> %11 to <32 x i8>
   %and.i = and i32 %8, 1431655765
@@ -27749,16 +27749,16 @@ if.then96.i:                                      ; preds = %if.else93.i
   %103 = and <2 x i64> %102, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %103, %and.i209.i
   %104 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %105 = lshr <4 x i32> %104, splat (i32 4)
+  %105 = lshr exact <4 x i32> %104, splat (i32 4)
   %106 = bitcast <4 x i32> %105 to <2 x i64>
   %107 = lshr <4 x i32> %97, splat (i32 6)
   %108 = bitcast <4 x i32> %107 to <2 x i64>
   %109 = and <2 x i64> %108, splat (i64 287104476311715840)
   %or.i287.i = or <2 x i64> %100, %and.i215.i
   %or.i290.i = or disjoint <2 x i64> %or.i287.i, %109
-  %or.i284.i = or <2 x i64> %or.i290.i, %106
+  %or.i284.i = or disjoint <2 x i64> %or.i290.i, %106
   %110 = bitcast <2 x i64> %or.i284.i to <4 x i32>
-  %sub.i.i = add <4 x i32> %110, splat (i32 983040)
+  %sub.i.i = add nuw nsw <4 x i32> %110, splat (i32 983040)
   %111 = lshr <4 x i32> %sub.i.i, splat (i32 10)
   %112 = bitcast <4 x i32> %111 to <2 x i64>
   %113 = and <2 x i64> %112, splat (i64 4393751544831)
@@ -28157,16 +28157,16 @@ if.then104.i:                                     ; preds = %if.else101.i
   %118 = and <2 x i64> %117, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %118, %and.i220.i
   %119 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %120 = lshr <4 x i32> %119, splat (i32 4)
+  %120 = lshr exact <4 x i32> %119, splat (i32 4)
   %121 = bitcast <4 x i32> %120 to <2 x i64>
   %122 = lshr <4 x i32> %112, splat (i32 6)
   %123 = bitcast <4 x i32> %122 to <2 x i64>
   %124 = and <2 x i64> %123, splat (i64 287104476311715840)
   %or.i350.i = or <2 x i64> %115, %and.i226.i
   %or.i353.i = or disjoint <2 x i64> %or.i350.i, %124
-  %or.i347.i = or <2 x i64> %or.i353.i, %121
+  %or.i347.i = or disjoint <2 x i64> %or.i353.i, %121
   %125 = bitcast <2 x i64> %or.i347.i to <4 x i32>
-  %sub.i.i = add <4 x i32> %125, splat (i32 983040)
+  %sub.i.i = add nuw nsw <4 x i32> %125, splat (i32 983040)
   %126 = lshr <4 x i32> %sub.i.i, splat (i32 10)
   %127 = bitcast <4 x i32> %126 to <2 x i64>
   %128 = and <2 x i64> %127, splat (i64 4393751544831)
@@ -28573,16 +28573,16 @@ if.then96.i:                                      ; preds = %if.else93.i
   %106 = and <2 x i64> %105, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %106, %and.i209.i
   %107 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %108 = lshr <4 x i32> %107, splat (i32 4)
+  %108 = lshr exact <4 x i32> %107, splat (i32 4)
   %109 = bitcast <4 x i32> %108 to <2 x i64>
   %110 = lshr <4 x i32> %100, splat (i32 6)
   %111 = bitcast <4 x i32> %110 to <2 x i64>
   %112 = and <2 x i64> %111, splat (i64 287104476311715840)
   %or.i287.i = or <2 x i64> %103, %and.i215.i
   %or.i290.i = or disjoint <2 x i64> %or.i287.i, %112
-  %or.i284.i = or <2 x i64> %or.i290.i, %109
+  %or.i284.i = or disjoint <2 x i64> %or.i290.i, %109
   %113 = bitcast <2 x i64> %or.i284.i to <4 x i32>
-  %sub.i.i = add <4 x i32> %113, splat (i32 983040)
+  %sub.i.i = add nuw nsw <4 x i32> %113, splat (i32 983040)
   %114 = lshr <4 x i32> %sub.i.i, splat (i32 10)
   %115 = bitcast <4 x i32> %114 to <2 x i64>
   %116 = and <2 x i64> %115, splat (i64 4393751544831)
@@ -29012,16 +29012,16 @@ if.then104.i:                                     ; preds = %if.else101.i
   %121 = and <2 x i64> %120, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %121, %and.i220.i
   %122 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %123 = lshr <4 x i32> %122, splat (i32 4)
+  %123 = lshr exact <4 x i32> %122, splat (i32 4)
   %124 = bitcast <4 x i32> %123 to <2 x i64>
   %125 = lshr <4 x i32> %115, splat (i32 6)
   %126 = bitcast <4 x i32> %125 to <2 x i64>
   %127 = and <2 x i64> %126, splat (i64 287104476311715840)
   %or.i350.i = or <2 x i64> %118, %and.i226.i
   %or.i353.i = or disjoint <2 x i64> %or.i350.i, %127
-  %or.i347.i = or <2 x i64> %or.i353.i, %124
+  %or.i347.i = or disjoint <2 x i64> %or.i353.i, %124
   %128 = bitcast <2 x i64> %or.i347.i to <4 x i32>
-  %sub.i.i = add <4 x i32> %128, splat (i32 983040)
+  %sub.i.i = add nuw nsw <4 x i32> %128, splat (i32 983040)
   %129 = lshr <4 x i32> %sub.i.i, splat (i32 10)
   %130 = bitcast <4 x i32> %129 to <2 x i64>
   %131 = and <2 x i64> %130, splat (i64 4393751544831)
@@ -29349,16 +29349,16 @@ if.then96.i.i:                                    ; preds = %if.else93.i.i
   %70 = and <2 x i64> %69, splat (i64 9007199256838144)
   %xor.i.i.i = xor <2 x i64> %70, %and.i209.i.i
   %71 = bitcast <2 x i64> %xor.i.i.i to <4 x i32>
-  %72 = lshr <4 x i32> %71, splat (i32 4)
+  %72 = lshr exact <4 x i32> %71, splat (i32 4)
   %73 = bitcast <4 x i32> %72 to <2 x i64>
   %74 = lshr <4 x i32> %64, splat (i32 6)
   %75 = bitcast <4 x i32> %74 to <2 x i64>
   %76 = and <2 x i64> %75, splat (i64 287104476311715840)
   %or.i287.i.i = or <2 x i64> %67, %and.i215.i.i
   %or.i290.i.i = or disjoint <2 x i64> %or.i287.i.i, %76
-  %or.i284.i.i = or <2 x i64> %or.i290.i.i, %73
+  %or.i284.i.i = or disjoint <2 x i64> %or.i290.i.i, %73
   %77 = bitcast <2 x i64> %or.i284.i.i to <4 x i32>
-  %sub.i.i.i = add <4 x i32> %77, splat (i32 983040)
+  %sub.i.i.i = add nuw nsw <4 x i32> %77, splat (i32 983040)
   %78 = lshr <4 x i32> %sub.i.i.i, splat (i32 10)
   %79 = bitcast <4 x i32> %78 to <2 x i64>
   %80 = and <2 x i64> %79, splat (i64 4393751544831)
@@ -29650,16 +29650,16 @@ if.then104.i.i:                                   ; preds = %if.else101.i.i
   %85 = and <2 x i64> %84, splat (i64 9007199256838144)
   %xor.i.i.i = xor <2 x i64> %85, %and.i220.i.i
   %86 = bitcast <2 x i64> %xor.i.i.i to <4 x i32>
-  %87 = lshr <4 x i32> %86, splat (i32 4)
+  %87 = lshr exact <4 x i32> %86, splat (i32 4)
   %88 = bitcast <4 x i32> %87 to <2 x i64>
   %89 = lshr <4 x i32> %79, splat (i32 6)
   %90 = bitcast <4 x i32> %89 to <2 x i64>
   %91 = and <2 x i64> %90, splat (i64 287104476311715840)
   %or.i350.i.i = or <2 x i64> %82, %and.i226.i.i
   %or.i353.i.i = or disjoint <2 x i64> %or.i350.i.i, %91
-  %or.i347.i.i = or <2 x i64> %or.i353.i.i, %88
+  %or.i347.i.i = or disjoint <2 x i64> %or.i353.i.i, %88
   %92 = bitcast <2 x i64> %or.i347.i.i to <4 x i32>
-  %sub.i.i.i = add <4 x i32> %92, splat (i32 983040)
+  %sub.i.i.i = add nuw nsw <4 x i32> %92, splat (i32 983040)
   %93 = lshr <4 x i32> %sub.i.i.i, splat (i32 10)
   %94 = bitcast <4 x i32> %93 to <2 x i64>
   %95 = and <2 x i64> %94, splat (i64 4393751544831)
@@ -29954,7 +29954,7 @@ if.then9.i:                                       ; preds = %while.body25.i
   %56 = and <2 x i64> %55, splat (i64 558454875139082176)
   %or.i270.i = or <2 x i64> %56, %and.i201.i
   %57 = bitcast <2 x i64> %or.i270.i to <8 x i16>
-  %conv.i377.i = zext <8 x i16> %57 to <8 x i32>
+  %conv.i377.i = zext nneg <8 x i16> %57 to <8 x i32>
   store <8 x i32> %conv.i377.i, ptr %utf32_output.addr.i.2732, align 1
   %add.ptr20.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.i.2732, i64 32
   br label %_ZN7simdutf7haswell12_GLOBAL__N_128convert_masked_utf8_to_utf32EPKcmRPDi.exit
@@ -30004,7 +30004,7 @@ if.then50.i:                                      ; preds = %if.end43.i707
   %77 = and <2 x i64> %76, splat (i64 558454875139082176)
   %or.i261.i = or <2 x i64> %77, %and.i186.i
   %78 = bitcast <2 x i64> %or.i261.i to <8 x i16>
-  %conv.i375.i = zext <8 x i16> %78 to <8 x i32>
+  %conv.i375.i = zext nneg <8 x i16> %78 to <8 x i32>
   store <8 x i32> %conv.i375.i, ptr %utf32_output.addr.i.2732, align 1
   br label %if.end135.sink.split.i
 
@@ -30054,14 +30054,14 @@ if.then99.i:                                      ; preds = %if.else96.i
   %100 = and <2 x i64> %99, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %100, %and.i165.i
   %101 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %102 = lshr <4 x i32> %101, splat (i32 4)
+  %102 = lshr exact <4 x i32> %101, splat (i32 4)
   %103 = bitcast <4 x i32> %102 to <2 x i64>
   %104 = lshr <4 x i32> %94, splat (i32 6)
   %105 = bitcast <4 x i32> %104 to <2 x i64>
   %106 = and <2 x i64> %105, splat (i64 7881299349733376)
   %or.i249.i = or <2 x i64> %97, %and.i171.i
   %or.i252.i = or disjoint <2 x i64> %or.i249.i, %106
-  %or.i.i710 = or <2 x i64> %or.i252.i, %103
+  %or.i.i710 = or disjoint <2 x i64> %or.i252.i, %103
   store <2 x i64> %or.i.i710, ptr %utf32_output.addr.i.2732, align 1
   br label %if.end135.sink.split.i
 
@@ -30340,7 +30340,7 @@ if.then9.i:                                       ; preds = %while.body31.i
   %59 = and <2 x i64> %58, splat (i64 558454875139082176)
   %or.i270.i = or <2 x i64> %59, %and.i201.i
   %60 = bitcast <2 x i64> %or.i270.i to <8 x i16>
-  %conv.i377.i = zext <8 x i16> %60 to <8 x i32>
+  %conv.i377.i = zext nneg <8 x i16> %60 to <8 x i32>
   store <8 x i32> %conv.i377.i, ptr %utf32_output.addr.i.2755, align 1
   %add.ptr20.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.i.2755, i64 32
   br label %_ZN7simdutf7haswell12_GLOBAL__N_128convert_masked_utf8_to_utf32EPKcmRPDi.exit
@@ -30390,7 +30390,7 @@ if.then50.i:                                      ; preds = %if.end43.i
   %80 = and <2 x i64> %79, splat (i64 558454875139082176)
   %or.i261.i = or <2 x i64> %80, %and.i186.i
   %81 = bitcast <2 x i64> %or.i261.i to <8 x i16>
-  %conv.i375.i = zext <8 x i16> %81 to <8 x i32>
+  %conv.i375.i = zext nneg <8 x i16> %81 to <8 x i32>
   store <8 x i32> %conv.i375.i, ptr %utf32_output.addr.i.2755, align 1
   br label %if.end135.sink.split.i
 
@@ -30440,14 +30440,14 @@ if.then99.i:                                      ; preds = %if.else96.i
   %103 = and <2 x i64> %102, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %103, %and.i165.i
   %104 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %105 = lshr <4 x i32> %104, splat (i32 4)
+  %105 = lshr exact <4 x i32> %104, splat (i32 4)
   %106 = bitcast <4 x i32> %105 to <2 x i64>
   %107 = lshr <4 x i32> %97, splat (i32 6)
   %108 = bitcast <4 x i32> %107 to <2 x i64>
   %109 = and <2 x i64> %108, splat (i64 7881299349733376)
   %or.i249.i = or <2 x i64> %100, %and.i171.i
   %or.i252.i = or disjoint <2 x i64> %or.i249.i, %109
-  %or.i.i729 = or <2 x i64> %or.i252.i, %106
+  %or.i.i729 = or disjoint <2 x i64> %or.i252.i, %106
   store <2 x i64> %or.i.i729, ptr %utf32_output.addr.i.2755, align 1
   br label %if.end135.sink.split.i
 
@@ -30647,7 +30647,7 @@ if.then9.i.i:                                     ; preds = %while.body8.i
   %23 = and <2 x i64> %22, splat (i64 558454875139082176)
   %or.i270.i.i = or <2 x i64> %23, %and.i201.i.i
   %24 = bitcast <2 x i64> %or.i270.i.i to <8 x i16>
-  %conv.i377.i.i = zext <8 x i16> %24 to <8 x i32>
+  %conv.i377.i.i = zext nneg <8 x i16> %24 to <8 x i32>
   store <8 x i32> %conv.i377.i.i, ptr %utf32_output.addr.2100.i, align 1
   %add.ptr20.i.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.2100.i, i64 32
   br label %_ZN7simdutf7haswell12_GLOBAL__N_128convert_masked_utf8_to_utf32EPKcmRPDi.exit.i
@@ -30697,7 +30697,7 @@ if.then50.i.i:                                    ; preds = %if.end43.i.i
   %44 = and <2 x i64> %43, splat (i64 558454875139082176)
   %or.i261.i.i = or <2 x i64> %44, %and.i186.i.i
   %45 = bitcast <2 x i64> %or.i261.i.i to <8 x i16>
-  %conv.i375.i.i = zext <8 x i16> %45 to <8 x i32>
+  %conv.i375.i.i = zext nneg <8 x i16> %45 to <8 x i32>
   store <8 x i32> %conv.i375.i.i, ptr %utf32_output.addr.2100.i, align 1
   br label %if.end135.sink.split.i.i
 
@@ -30747,14 +30747,14 @@ if.then99.i.i:                                    ; preds = %if.else96.i.i
   %67 = and <2 x i64> %66, splat (i64 9007199256838144)
   %xor.i.i.i = xor <2 x i64> %67, %and.i165.i.i
   %68 = bitcast <2 x i64> %xor.i.i.i to <4 x i32>
-  %69 = lshr <4 x i32> %68, splat (i32 4)
+  %69 = lshr exact <4 x i32> %68, splat (i32 4)
   %70 = bitcast <4 x i32> %69 to <2 x i64>
   %71 = lshr <4 x i32> %61, splat (i32 6)
   %72 = bitcast <4 x i32> %71 to <2 x i64>
   %73 = and <2 x i64> %72, splat (i64 7881299349733376)
   %or.i249.i.i = or <2 x i64> %64, %and.i171.i.i
   %or.i252.i.i = or disjoint <2 x i64> %or.i249.i.i, %73
-  %or.i.i84.i = or <2 x i64> %or.i252.i.i, %70
+  %or.i.i84.i = or disjoint <2 x i64> %or.i252.i.i, %70
   store <2 x i64> %or.i.i84.i, ptr %utf32_output.addr.2100.i, align 1
   br label %if.end135.sink.split.i.i
 
@@ -31513,7 +31513,7 @@ if.then32.i:                                      ; preds = %if.end.i
   %and.i312.i = and <4 x i64> %0, splat (i64 17733194119839807)
   %or.i465.i = or disjoint <4 x i64> %and.i319.i, %and.i312.i
   %18 = bitcast <4 x i64> %or.i465.i to <16 x i16>
-  %19 = or <16 x i16> %18, splat (i16 -16256)
+  %19 = or disjoint <16 x i16> %18, splat (i16 -16256)
   %20 = select <16 x i1> %cmp.i434.i, <16 x i16> %7, <16 x i16> %19
   %21 = bitcast <16 x i16> %20 to <32 x i8>
   %and.i = and i32 %10, 1431655765
@@ -31841,7 +31841,7 @@ if.then36.i:                                      ; preds = %if.end.i
   %and.i318.i = and <4 x i64> %2, splat (i64 17733194119839807)
   %or.i471.i = or disjoint <4 x i64> %and.i325.i, %and.i318.i
   %20 = bitcast <4 x i64> %or.i471.i to <16 x i16>
-  %21 = or <16 x i16> %20, splat (i16 -16256)
+  %21 = or disjoint <16 x i16> %20, splat (i16 -16256)
   %22 = select <16 x i1> %cmp.i440.i, <16 x i16> %9, <16 x i16> %21
   %23 = bitcast <16 x i16> %22 to <32 x i8>
   %and.i = and i32 %12, 1431655765
@@ -32168,7 +32168,7 @@ if.then32.i:                                      ; preds = %if.end.i
   %and.i326.i = and <4 x i64> %0, splat (i64 17733194119839807)
   %or.i479.i = or disjoint <4 x i64> %and.i333.i, %and.i326.i
   %18 = bitcast <4 x i64> %or.i479.i to <16 x i16>
-  %19 = or <16 x i16> %18, splat (i16 -16256)
+  %19 = or disjoint <16 x i16> %18, splat (i16 -16256)
   %20 = select <16 x i1> %cmp.i448.i, <16 x i16> %7, <16 x i16> %19
   %21 = bitcast <16 x i16> %20 to <32 x i8>
   %and.i = and i32 %10, 1431655765
@@ -32512,7 +32512,7 @@ if.then36.i:                                      ; preds = %if.end.i
   %and.i332.i = and <4 x i64> %2, splat (i64 17733194119839807)
   %or.i485.i = or disjoint <4 x i64> %and.i339.i, %and.i332.i
   %20 = bitcast <4 x i64> %or.i485.i to <16 x i16>
-  %21 = or <16 x i16> %20, splat (i16 -16256)
+  %21 = or disjoint <16 x i16> %20, splat (i16 -16256)
   %22 = select <16 x i1> %cmp.i454.i, <16 x i16> %9, <16 x i16> %21
   %23 = bitcast <16 x i16> %22 to <32 x i8>
   %and.i = and i32 %12, 1431655765
@@ -32886,7 +32886,7 @@ if.then52.i:                                      ; preds = %if.end.i
   %and.i372.i = and <4 x i64> %perm.i, splat (i64 17733194119839807)
   %or.i584.i = or disjoint <4 x i64> %and.i379.i, %and.i372.i
   %25 = bitcast <4 x i64> %or.i584.i to <16 x i16>
-  %26 = or <16 x i16> %25, splat (i16 -16256)
+  %26 = or disjoint <16 x i16> %25, splat (i16 -16256)
   %27 = select <16 x i1> %cmp.i539.i, <16 x i16> %15, <16 x i16> %26
   %28 = bitcast <16 x i16> %27 to <32 x i8>
   %and.i = and i32 %18, 1431655765
@@ -33508,7 +33508,7 @@ if.then59.i:                                      ; preds = %if.end41.i
   %and.i400.i = and <4 x i64> %perm.i, splat (i64 17733194119839807)
   %or.i598.i = or disjoint <4 x i64> %and.i407.i, %and.i400.i
   %28 = bitcast <4 x i64> %or.i598.i to <16 x i16>
-  %29 = or <16 x i16> %28, splat (i16 -16256)
+  %29 = or disjoint <16 x i16> %28, splat (i16 -16256)
   %30 = select <16 x i1> %cmp.i560.i, <16 x i16> %17, <16 x i16> %29
   %31 = bitcast <16 x i16> %30 to <32 x i8>
   %and.i = and i32 %20, 1431655765
@@ -39374,7 +39374,7 @@ if.end.i:                                         ; preds = %while.body.i
   %and.i.i.i.i = and <2 x i64> %4, splat (i64 17733194119839807)
   %or.i37.i.i.i = or disjoint <2 x i64> %and.i24.i.i.i, %and.i.i.i.i
   %13 = bitcast <2 x i64> %or.i37.i.i.i to <8 x i16>
-  %14 = or <8 x i16> %13, splat (i16 -16256)
+  %14 = or disjoint <8 x i16> %13, splat (i16 -16256)
   %15 = select <8 x i1> %cmp.i.i.i, <8 x i16> %7, <8 x i16> %14
   %16 = bitcast <8 x i16> %15 to <16 x i8>
   %17 = lshr i16 %10, 7
@@ -39402,7 +39402,7 @@ if.end.i:                                         ; preds = %while.body.i
   %and.i.i.i38.i = and <2 x i64> %6, splat (i64 17733194119839807)
   %or.i37.i.i39.i = or disjoint <2 x i64> %and.i24.i.i37.i, %and.i.i.i38.i
   %28 = bitcast <2 x i64> %or.i37.i.i39.i to <8 x i16>
-  %29 = or <8 x i16> %28, splat (i16 -16256)
+  %29 = or disjoint <8 x i16> %28, splat (i16 -16256)
   %30 = select <8 x i1> %cmp.i.i35.i, <8 x i16> %22, <8 x i16> %29
   %31 = bitcast <8 x i16> %30 to <16 x i8>
   %32 = lshr i16 %25, 7
@@ -39454,7 +39454,7 @@ if.else.i:                                        ; preds = %if.then16.i
   %and.i.i.i50.i = and <2 x i64> %41, splat (i64 17733194119839807)
   %or.i37.i.i51.i = or disjoint <2 x i64> %and.i24.i.i49.i, %and.i.i.i50.i
   %48 = bitcast <2 x i64> %or.i37.i.i51.i to <8 x i16>
-  %49 = or <8 x i16> %48, splat (i16 -16256)
+  %49 = or disjoint <8 x i16> %48, splat (i16 -16256)
   %50 = select <8 x i1> %cmp.i.i47.i, <8 x i16> %42, <8 x i16> %49
   %51 = bitcast <8 x i16> %50 to <16 x i8>
   %52 = lshr i16 %45, 7
@@ -41075,16 +41075,16 @@ if.then98.i:                                      ; preds = %if.else95.i
   %132 = and <2 x i64> %131, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %132, %and.i211.i
   %133 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %134 = lshr <4 x i32> %133, splat (i32 4)
+  %134 = lshr exact <4 x i32> %133, splat (i32 4)
   %135 = bitcast <4 x i32> %134 to <2 x i64>
   %136 = lshr <4 x i32> %126, splat (i32 6)
   %137 = bitcast <4 x i32> %136 to <2 x i64>
   %138 = and <2 x i64> %137, splat (i64 287104476311715840)
   %or.i290.i = or <2 x i64> %129, %and.i217.i
   %or.i293.i = or disjoint <2 x i64> %or.i290.i, %138
-  %or.i287.i = or <2 x i64> %or.i293.i, %135
+  %or.i287.i = or disjoint <2 x i64> %or.i293.i, %135
   %139 = bitcast <2 x i64> %or.i287.i to <4 x i32>
-  %sub.i.i = add <4 x i32> %139, splat (i32 983040)
+  %sub.i.i = add nuw nsw <4 x i32> %139, splat (i32 983040)
   %140 = lshr <4 x i32> %sub.i.i, splat (i32 10)
   %141 = bitcast <4 x i32> %140 to <2 x i64>
   %142 = and <2 x i64> %141, splat (i64 4393751544831)
@@ -41582,16 +41582,16 @@ if.then104.i:                                     ; preds = %if.else101.i
   %157 = and <2 x i64> %156, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %157, %and.i220.i
   %158 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %159 = lshr <4 x i32> %158, splat (i32 4)
+  %159 = lshr exact <4 x i32> %158, splat (i32 4)
   %160 = bitcast <4 x i32> %159 to <2 x i64>
   %161 = lshr <4 x i32> %151, splat (i32 6)
   %162 = bitcast <4 x i32> %161 to <2 x i64>
   %163 = and <2 x i64> %162, splat (i64 287104476311715840)
   %or.i317.i = or <2 x i64> %154, %and.i226.i
   %or.i320.i = or disjoint <2 x i64> %or.i317.i, %163
-  %or.i314.i = or <2 x i64> %or.i320.i, %160
+  %or.i314.i = or disjoint <2 x i64> %or.i320.i, %160
   %164 = bitcast <2 x i64> %or.i314.i to <4 x i32>
-  %sub.i.i = add <4 x i32> %164, splat (i32 983040)
+  %sub.i.i = add nuw nsw <4 x i32> %164, splat (i32 983040)
   %165 = lshr <4 x i32> %sub.i.i, splat (i32 10)
   %166 = bitcast <4 x i32> %165 to <2 x i64>
   %167 = and <2 x i64> %166, splat (i64 4393751544831)
@@ -42087,16 +42087,16 @@ if.then98.i:                                      ; preds = %if.else95.i
   %135 = and <2 x i64> %134, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %135, %and.i211.i
   %136 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %137 = lshr <4 x i32> %136, splat (i32 4)
+  %137 = lshr exact <4 x i32> %136, splat (i32 4)
   %138 = bitcast <4 x i32> %137 to <2 x i64>
   %139 = lshr <4 x i32> %129, splat (i32 6)
   %140 = bitcast <4 x i32> %139 to <2 x i64>
   %141 = and <2 x i64> %140, splat (i64 287104476311715840)
   %or.i290.i = or <2 x i64> %132, %and.i217.i
   %or.i293.i = or disjoint <2 x i64> %or.i290.i, %141
-  %or.i287.i = or <2 x i64> %or.i293.i, %138
+  %or.i287.i = or disjoint <2 x i64> %or.i293.i, %138
   %142 = bitcast <2 x i64> %or.i287.i to <4 x i32>
-  %sub.i.i = add <4 x i32> %142, splat (i32 983040)
+  %sub.i.i = add nuw nsw <4 x i32> %142, splat (i32 983040)
   %143 = lshr <4 x i32> %sub.i.i, splat (i32 10)
   %144 = bitcast <4 x i32> %143 to <2 x i64>
   %145 = and <2 x i64> %144, splat (i64 4393751544831)
@@ -42625,16 +42625,16 @@ if.then104.i:                                     ; preds = %if.else101.i
   %160 = and <2 x i64> %159, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %160, %and.i220.i
   %161 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %162 = lshr <4 x i32> %161, splat (i32 4)
+  %162 = lshr exact <4 x i32> %161, splat (i32 4)
   %163 = bitcast <4 x i32> %162 to <2 x i64>
   %164 = lshr <4 x i32> %154, splat (i32 6)
   %165 = bitcast <4 x i32> %164 to <2 x i64>
   %166 = and <2 x i64> %165, splat (i64 287104476311715840)
   %or.i317.i = or <2 x i64> %157, %and.i226.i
   %or.i320.i = or disjoint <2 x i64> %or.i317.i, %166
-  %or.i314.i = or <2 x i64> %or.i320.i, %163
+  %or.i314.i = or disjoint <2 x i64> %or.i320.i, %163
   %167 = bitcast <2 x i64> %or.i314.i to <4 x i32>
-  %sub.i.i = add <4 x i32> %167, splat (i32 983040)
+  %sub.i.i = add nuw nsw <4 x i32> %167, splat (i32 983040)
   %168 = lshr <4 x i32> %sub.i.i, splat (i32 10)
   %169 = bitcast <4 x i32> %168 to <2 x i64>
   %170 = and <2 x i64> %169, splat (i64 4393751544831)
@@ -43002,16 +43002,16 @@ if.then98.i.i:                                    ; preds = %if.else95.i.i
   %74 = and <2 x i64> %73, splat (i64 9007199256838144)
   %xor.i.i.i = xor <2 x i64> %74, %and.i211.i.i
   %75 = bitcast <2 x i64> %xor.i.i.i to <4 x i32>
-  %76 = lshr <4 x i32> %75, splat (i32 4)
+  %76 = lshr exact <4 x i32> %75, splat (i32 4)
   %77 = bitcast <4 x i32> %76 to <2 x i64>
   %78 = lshr <4 x i32> %68, splat (i32 6)
   %79 = bitcast <4 x i32> %78 to <2 x i64>
   %80 = and <2 x i64> %79, splat (i64 287104476311715840)
   %or.i290.i.i = or <2 x i64> %71, %and.i217.i.i
   %or.i293.i.i = or disjoint <2 x i64> %or.i290.i.i, %80
-  %or.i287.i.i = or <2 x i64> %or.i293.i.i, %77
+  %or.i287.i.i = or disjoint <2 x i64> %or.i293.i.i, %77
   %81 = bitcast <2 x i64> %or.i287.i.i to <4 x i32>
-  %sub.i.i.i = add <4 x i32> %81, splat (i32 983040)
+  %sub.i.i.i = add nuw nsw <4 x i32> %81, splat (i32 983040)
   %82 = lshr <4 x i32> %sub.i.i.i, splat (i32 10)
   %83 = bitcast <4 x i32> %82 to <2 x i64>
   %84 = and <2 x i64> %83, splat (i64 4393751544831)
@@ -43353,16 +43353,16 @@ if.then104.i.i:                                   ; preds = %if.else101.i.i
   %99 = and <2 x i64> %98, splat (i64 9007199256838144)
   %xor.i.i.i = xor <2 x i64> %99, %and.i220.i.i
   %100 = bitcast <2 x i64> %xor.i.i.i to <4 x i32>
-  %101 = lshr <4 x i32> %100, splat (i32 4)
+  %101 = lshr exact <4 x i32> %100, splat (i32 4)
   %102 = bitcast <4 x i32> %101 to <2 x i64>
   %103 = lshr <4 x i32> %93, splat (i32 6)
   %104 = bitcast <4 x i32> %103 to <2 x i64>
   %105 = and <2 x i64> %104, splat (i64 287104476311715840)
   %or.i317.i.i = or <2 x i64> %96, %and.i226.i.i
   %or.i320.i.i = or disjoint <2 x i64> %or.i317.i.i, %105
-  %or.i314.i.i = or <2 x i64> %or.i320.i.i, %102
+  %or.i314.i.i = or disjoint <2 x i64> %or.i320.i.i, %102
   %106 = bitcast <2 x i64> %or.i314.i.i to <4 x i32>
-  %sub.i.i.i = add <4 x i32> %106, splat (i32 983040)
+  %sub.i.i.i = add nuw nsw <4 x i32> %106, splat (i32 983040)
   %107 = lshr <4 x i32> %sub.i.i.i, splat (i32 10)
   %108 = bitcast <4 x i32> %107 to <2 x i64>
   %109 = and <2 x i64> %108, splat (i64 4393751544831)
@@ -43760,7 +43760,7 @@ if.then18.i:                                      ; preds = %while.body103.i
   %or.i285.i = or <2 x i64> %85, %and.i218.i
   %86 = bitcast <2 x i64> %or.i285.i to <8 x i16>
   %shuffle.i424.i = shufflevector <8 x i16> %86, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %conv.i425.i = zext <4 x i16> %shuffle.i424.i to <4 x i32>
+  %conv.i425.i = zext nneg <4 x i16> %shuffle.i424.i to <4 x i32>
   store <4 x i32> %conv.i425.i, ptr %utf32_output.addr.i.2610, align 1
   %add.ptr28.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.i.2610, i64 16
   %87 = bitcast <2 x i64> %or.i285.i to <8 x i16>
@@ -43816,7 +43816,7 @@ if.then63.i:                                      ; preds = %if.end56.i
   %or.i276.i = or <2 x i64> %107, %and.i203.i
   %108 = bitcast <2 x i64> %or.i276.i to <8 x i16>
   %shuffle.i418.i = shufflevector <8 x i16> %108, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %conv.i419.i = zext <4 x i16> %shuffle.i418.i to <4 x i32>
+  %conv.i419.i = zext nneg <4 x i16> %shuffle.i418.i to <4 x i32>
   store <4 x i32> %conv.i419.i, ptr %utf32_output.addr.i.2610, align 1
   %add.ptr80.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.i.2610, i64 16
   %109 = bitcast <2 x i64> %or.i276.i to <8 x i16>
@@ -43871,14 +43871,14 @@ if.then116.i:                                     ; preds = %if.else113.i
   %131 = and <2 x i64> %130, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %131, %and.i182.i
   %132 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %133 = lshr <4 x i32> %132, splat (i32 4)
+  %133 = lshr exact <4 x i32> %132, splat (i32 4)
   %134 = bitcast <4 x i32> %133 to <2 x i64>
   %135 = lshr <4 x i32> %125, splat (i32 6)
   %136 = bitcast <4 x i32> %135 to <2 x i64>
   %137 = and <2 x i64> %136, splat (i64 7881299349733376)
   %or.i264.i = or <2 x i64> %128, %and.i188.i
   %or.i267.i = or disjoint <2 x i64> %or.i264.i, %137
-  %or.i.i585 = or <2 x i64> %or.i267.i, %134
+  %or.i.i585 = or disjoint <2 x i64> %or.i267.i, %134
   store <2 x i64> %or.i.i585, ptr %utf32_output.addr.i.2610, align 1
   br label %if.end152.sink.split.i
 
@@ -44260,7 +44260,7 @@ if.then18.i:                                      ; preds = %while.body109.i
   %or.i285.i = or <2 x i64> %88, %and.i218.i
   %89 = bitcast <2 x i64> %or.i285.i to <8 x i16>
   %shuffle.i424.i = shufflevector <8 x i16> %89, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %conv.i425.i = zext <4 x i16> %shuffle.i424.i to <4 x i32>
+  %conv.i425.i = zext nneg <4 x i16> %shuffle.i424.i to <4 x i32>
   store <4 x i32> %conv.i425.i, ptr %utf32_output.addr.i.2636, align 1
   %add.ptr28.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.i.2636, i64 16
   %90 = bitcast <2 x i64> %or.i285.i to <8 x i16>
@@ -44316,7 +44316,7 @@ if.then63.i:                                      ; preds = %if.end56.i
   %or.i276.i = or <2 x i64> %110, %and.i203.i
   %111 = bitcast <2 x i64> %or.i276.i to <8 x i16>
   %shuffle.i418.i = shufflevector <8 x i16> %111, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %conv.i419.i = zext <4 x i16> %shuffle.i418.i to <4 x i32>
+  %conv.i419.i = zext nneg <4 x i16> %shuffle.i418.i to <4 x i32>
   store <4 x i32> %conv.i419.i, ptr %utf32_output.addr.i.2636, align 1
   %add.ptr80.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.i.2636, i64 16
   %112 = bitcast <2 x i64> %or.i276.i to <8 x i16>
@@ -44371,14 +44371,14 @@ if.then116.i:                                     ; preds = %if.else113.i
   %134 = and <2 x i64> %133, splat (i64 9007199256838144)
   %xor.i.i = xor <2 x i64> %134, %and.i182.i
   %135 = bitcast <2 x i64> %xor.i.i to <4 x i32>
-  %136 = lshr <4 x i32> %135, splat (i32 4)
+  %136 = lshr exact <4 x i32> %135, splat (i32 4)
   %137 = bitcast <4 x i32> %136 to <2 x i64>
   %138 = lshr <4 x i32> %128, splat (i32 6)
   %139 = bitcast <4 x i32> %138 to <2 x i64>
   %140 = and <2 x i64> %139, splat (i64 7881299349733376)
   %or.i264.i = or <2 x i64> %131, %and.i188.i
   %or.i267.i = or disjoint <2 x i64> %or.i264.i, %140
-  %or.i.i606 = or <2 x i64> %or.i267.i, %137
+  %or.i.i606 = or disjoint <2 x i64> %or.i267.i, %137
   store <2 x i64> %or.i.i606, ptr %utf32_output.addr.i.2636, align 1
   br label %if.end152.sink.split.i
 
@@ -44632,7 +44632,7 @@ if.then18.i.i:                                    ; preds = %while.body8.i
   %or.i285.i.i = or <2 x i64> %27, %and.i218.i.i
   %28 = bitcast <2 x i64> %or.i285.i.i to <8 x i16>
   %shuffle.i424.i.i = shufflevector <8 x i16> %28, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %conv.i425.i.i = zext <4 x i16> %shuffle.i424.i.i to <4 x i32>
+  %conv.i425.i.i = zext nneg <4 x i16> %shuffle.i424.i.i to <4 x i32>
   store <4 x i32> %conv.i425.i.i, ptr %utf32_output.addr.2110.i, align 1
   %add.ptr28.i.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.2110.i, i64 16
   %29 = bitcast <2 x i64> %or.i285.i.i to <8 x i16>
@@ -44688,7 +44688,7 @@ if.then63.i.i:                                    ; preds = %if.end56.i.i
   %or.i276.i.i = or <2 x i64> %49, %and.i203.i.i
   %50 = bitcast <2 x i64> %or.i276.i.i to <8 x i16>
   %shuffle.i418.i.i = shufflevector <8 x i16> %50, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %conv.i419.i.i = zext <4 x i16> %shuffle.i418.i.i to <4 x i32>
+  %conv.i419.i.i = zext nneg <4 x i16> %shuffle.i418.i.i to <4 x i32>
   store <4 x i32> %conv.i419.i.i, ptr %utf32_output.addr.2110.i, align 1
   %add.ptr80.i.i = getelementptr inbounds nuw i8, ptr %utf32_output.addr.2110.i, i64 16
   %51 = bitcast <2 x i64> %or.i276.i.i to <8 x i16>
@@ -44743,14 +44743,14 @@ if.then116.i.i:                                   ; preds = %if.else113.i.i
   %73 = and <2 x i64> %72, splat (i64 9007199256838144)
   %xor.i.i.i = xor <2 x i64> %73, %and.i182.i.i
   %74 = bitcast <2 x i64> %xor.i.i.i to <4 x i32>
-  %75 = lshr <4 x i32> %74, splat (i32 4)
+  %75 = lshr exact <4 x i32> %74, splat (i32 4)
   %76 = bitcast <4 x i32> %75 to <2 x i64>
   %77 = lshr <4 x i32> %67, splat (i32 6)
   %78 = bitcast <4 x i32> %77 to <2 x i64>
   %79 = and <2 x i64> %78, splat (i64 7881299349733376)
   %or.i264.i.i = or <2 x i64> %70, %and.i188.i.i
   %or.i267.i.i = or disjoint <2 x i64> %or.i264.i.i, %79
-  %or.i.i90.i = or <2 x i64> %or.i267.i.i, %76
+  %or.i.i90.i = or disjoint <2 x i64> %or.i267.i.i, %76
   store <2 x i64> %or.i.i90.i, ptr %utf32_output.addr.2110.i, align 1
   br label %if.end152.sink.split.i.i
 
@@ -45475,7 +45475,7 @@ if.then30.i:                                      ; preds = %if.end20.i
   %and.i.i.i = and <2 x i64> %in.0.i, splat (i64 17733194119839807)
   %or.i37.i.i = or disjoint <2 x i64> %and.i24.i.i, %and.i.i.i
   %19 = bitcast <2 x i64> %or.i37.i.i to <8 x i16>
-  %20 = or <8 x i16> %19, splat (i16 -16256)
+  %20 = or disjoint <8 x i16> %19, splat (i16 -16256)
   %21 = select <8 x i1> %cmp.i410.i, <8 x i16> %8, <8 x i16> %20
   %22 = bitcast <8 x i16> %21 to <16 x i8>
   %23 = lshr i16 %11, 7
@@ -45791,7 +45791,7 @@ if.then35.i:                                      ; preds = %if.end25.i
   %and.i.i.i = and <2 x i64> %in.0.i, splat (i64 17733194119839807)
   %or.i37.i.i = or disjoint <2 x i64> %and.i24.i.i, %and.i.i.i
   %23 = bitcast <2 x i64> %or.i37.i.i to <8 x i16>
-  %24 = or <8 x i16> %23, splat (i16 -16256)
+  %24 = or disjoint <8 x i16> %23, splat (i16 -16256)
   %25 = select <8 x i1> %cmp.i519.i, <8 x i16> %12, <8 x i16> %24
   %26 = bitcast <8 x i16> %25 to <16 x i8>
   %27 = lshr i16 %15, 7
@@ -46105,7 +46105,7 @@ if.then30.i:                                      ; preds = %if.end20.i
   %and.i.i.i = and <2 x i64> %in.0.i, splat (i64 17733194119839807)
   %or.i37.i.i = or disjoint <2 x i64> %and.i24.i.i, %and.i.i.i
   %19 = bitcast <2 x i64> %or.i37.i.i to <8 x i16>
-  %20 = or <8 x i16> %19, splat (i16 -16256)
+  %20 = or disjoint <8 x i16> %19, splat (i16 -16256)
   %21 = select <8 x i1> %cmp.i424.i, <8 x i16> %8, <8 x i16> %20
   %22 = bitcast <8 x i16> %21 to <16 x i8>
   %23 = lshr i16 %11, 7
@@ -46439,7 +46439,7 @@ if.then35.i:                                      ; preds = %if.end25.i
   %and.i.i.i = and <2 x i64> %in.0.i, splat (i64 17733194119839807)
   %or.i37.i.i = or disjoint <2 x i64> %and.i24.i.i, %and.i.i.i
   %23 = bitcast <2 x i64> %or.i37.i.i to <8 x i16>
-  %24 = or <8 x i16> %23, splat (i16 -16256)
+  %24 = or disjoint <8 x i16> %23, splat (i16 -16256)
   %25 = select <8 x i1> %cmp.i533.i, <8 x i16> %12, <8 x i16> %24
   %26 = bitcast <8 x i16> %25 to <16 x i8>
   %27 = lshr i16 %15, 7
@@ -47191,7 +47191,7 @@ if.then48.i:                                      ; preds = %if.end38.i
   %and.i275.i = and <2 x i64> %in_16.0.i, splat (i64 17733194119839807)
   %or.i375.i = or disjoint <2 x i64> %and.i278.i, %and.i275.i
   %32 = bitcast <2 x i64> %or.i375.i to <8 x i16>
-  %33 = or <8 x i16> %32, splat (i16 -16256)
+  %33 = or disjoint <8 x i16> %32, splat (i16 -16256)
   %34 = select <8 x i1> %cmp.i587.i, <8 x i16> %22, <8 x i16> %33
   %35 = bitcast <8 x i16> %34 to <16 x i8>
   %36 = lshr i16 %25, 7
@@ -47572,7 +47572,7 @@ if.then66.i:                                      ; preds = %if.end55.i
   %and.i328.i = and <2 x i64> %in_16.0.i, splat (i64 17733194119839807)
   %or.i426.i = or disjoint <2 x i64> %and.i331.i, %and.i328.i
   %37 = bitcast <2 x i64> %or.i426.i to <8 x i16>
-  %38 = or <8 x i16> %37, splat (i16 -16256)
+  %38 = or disjoint <8 x i16> %37, splat (i16 -16256)
   %39 = select <8 x i1> %cmp.i635.i, <8 x i16> %26, <8 x i16> %38
   %40 = bitcast <8 x i16> %39 to <16 x i8>
   %41 = lshr i16 %29, 7
diff --git a/bench/ocio/optimized/Lut1DOpCPU_SSE2.ll b/bench/ocio/optimized/Lut1DOpCPU_SSE2.ll
index b4b55e71de2..8b9752a0bda 100644
--- a/bench/ocio/optimized/Lut1DOpCPU_SSE2.ll
+++ b/bench/ocio/optimized/Lut1DOpCPU_SSE2.ll
@@ -2340,7 +2340,7 @@ define internal void @_ZN19OpenColorIO_v2_5dev12_GLOBAL__N_18linear1DILNS_8BitDe
   %164 = bitcast <4 x float> %163 to <2 x i64>
   %165 = and <2 x i64> %164, splat (i64 17587891081215)
   %166 = bitcast <2 x i64> %158 to <4 x i32>
-  %167 = icmp sgt <4 x i32> %166, splat (i32 2139095040)
+  %167 = icmp samesign ugt <4 x i32> %166, splat (i32 2139095040)
   %168 = sext <4 x i1> %167 to <4 x i32>
   %169 = bitcast <4 x i32> %168 to <16 x i8>
   %170 = icmp slt <16 x i8> %169, zeroinitializer
@@ -2376,7 +2376,7 @@ _ZN19OpenColorIO_v2_5devL13sse2_cvtps_phEDv4_f.exit.i: ; preds = %172, %.lr.ph
   %193 = bitcast <4 x float> %192 to <2 x i64>
   %194 = and <2 x i64> %193, splat (i64 17587891081215)
   %195 = bitcast <2 x i64> %187 to <4 x i32>
-  %196 = icmp sgt <4 x i32> %195, splat (i32 2139095040)
+  %196 = icmp samesign ugt <4 x i32> %195, splat (i32 2139095040)
   %197 = sext <4 x i1> %196 to <4 x i32>
   %198 = bitcast <4 x i32> %197 to <16 x i8>
   %199 = icmp slt <16 x i8> %198, zeroinitializer
@@ -2412,7 +2412,7 @@ _ZN19OpenColorIO_v2_5devL13sse2_cvtps_phEDv4_f.exit13.i: ; preds = %201, %_ZN19O
   %222 = bitcast <4 x float> %221 to <2 x i64>
   %223 = and <2 x i64> %222, splat (i64 17587891081215)
   %224 = bitcast <2 x i64> %216 to <4 x i32>
-  %225 = icmp sgt <4 x i32> %224, splat (i32 2139095040)
+  %225 = icmp samesign ugt <4 x i32> %224, splat (i32 2139095040)
   %226 = sext <4 x i1> %225 to <4 x i32>
   %227 = bitcast <4 x i32> %226 to <16 x i8>
   %228 = icmp slt <16 x i8> %227, zeroinitializer
@@ -2448,7 +2448,7 @@ _ZN19OpenColorIO_v2_5devL13sse2_cvtps_phEDv4_f.exit16.i: ; preds = %230, %_ZN19O
   %251 = bitcast <4 x float> %250 to <2 x i64>
   %252 = and <2 x i64> %251, splat (i64 17587891081215)
   %253 = bitcast <2 x i64> %245 to <4 x i32>
-  %254 = icmp sgt <4 x i32> %253, splat (i32 2139095040)
+  %254 = icmp samesign ugt <4 x i32> %253, splat (i32 2139095040)
   %255 = sext <4 x i1> %254 to <4 x i32>
   %256 = bitcast <4 x i32> %255 to <16 x i8>
   %257 = icmp slt <16 x i8> %256, zeroinitializer
@@ -2748,7 +2748,7 @@ _ZN19OpenColorIO_v2_5dev12SSE2RGBAPackILNS_8BitDepthE7EE5StoreEPN9Imath_3_14half
   %473 = bitcast <4 x float> %472 to <2 x i64>
   %474 = and <2 x i64> %473, splat (i64 17587891081215)
   %475 = bitcast <2 x i64> %467 to <4 x i32>
-  %476 = icmp sgt <4 x i32> %475, splat (i32 2139095040)
+  %476 = icmp samesign ugt <4 x i32> %475, splat (i32 2139095040)
   %477 = sext <4 x i1> %476 to <4 x i32>
   %478 = bitcast <4 x i32> %477 to <16 x i8>
   %479 = icmp slt <16 x i8> %478, zeroinitializer
@@ -2784,7 +2784,7 @@ _ZN19OpenColorIO_v2_5devL13sse2_cvtps_phEDv4_f.exit.i146: ; preds = %481, %._cri
   %502 = bitcast <4 x float> %501 to <2 x i64>
   %503 = and <2 x i64> %502, splat (i64 17587891081215)
   %504 = bitcast <2 x i64> %496 to <4 x i32>
-  %505 = icmp sgt <4 x i32> %504, splat (i32 2139095040)
+  %505 = icmp samesign ugt <4 x i32> %504, splat (i32 2139095040)
   %506 = sext <4 x i1> %505 to <4 x i32>
   %507 = bitcast <4 x i32> %506 to <16 x i8>
   %508 = icmp slt <16 x i8> %507, zeroinitializer
@@ -2820,7 +2820,7 @@ _ZN19OpenColorIO_v2_5devL13sse2_cvtps_phEDv4_f.exit13.i149: ; preds = %510, %_ZN
   %531 = bitcast <4 x float> %530 to <2 x i64>
   %532 = and <2 x i64> %531, splat (i64 17587891081215)
   %533 = bitcast <2 x i64> %525 to <4 x i32>
-  %534 = icmp sgt <4 x i32> %533, splat (i32 2139095040)
+  %534 = icmp samesign ugt <4 x i32> %533, splat (i32 2139095040)
   %535 = sext <4 x i1> %534 to <4 x i32>
   %536 = bitcast <4 x i32> %535 to <16 x i8>
   %537 = icmp slt <16 x i8> %536, zeroinitializer
@@ -2856,7 +2856,7 @@ _ZN19OpenColorIO_v2_5devL13sse2_cvtps_phEDv4_f.exit16.i152: ; preds = %539, %_ZN
   %560 = bitcast <4 x float> %559 to <2 x i64>
   %561 = and <2 x i64> %560, splat (i64 17587891081215)
   %562 = bitcast <2 x i64> %554 to <4 x i32>
-  %563 = icmp sgt <4 x i32> %562, splat (i32 2139095040)
+  %563 = icmp samesign ugt <4 x i32> %562, splat (i32 2139095040)
   %564 = sext <4 x i1> %563 to <4 x i32>
   %565 = bitcast <4 x i32> %564 to <16 x i8>
   %566 = icmp slt <16 x i8> %565, zeroinitializer
diff --git a/scripts/setup_pre_commit_patch.sh b/scripts/setup_pre_commit_patch.sh
index c5409e09ef4..66f502762d9 100755
--- a/scripts/setup_pre_commit_patch.sh
+++ b/scripts/setup_pre_commit_patch.sh
@@ -2,7 +2,7 @@
 set -euo pipefail
 shopt -s inherit_errexit
 
-export GITHUB_PATCH_ID="<user_name>/llvm-project/commit/<commit_hash>"
+export GITHUB_PATCH_ID=llvm/llvm-project/pull/125935
 export COMPTIME_MODE=0
 
 # Please rebase manually