Skip to content

Commit 4376f8d

Browse files
authored
Merge pull request #242 from bulasevich/GR-71797
[Backport] [Oracle GraalVM] [GR-71797] Backport to 23.1: Sync HotSpot intrinsic stub ports.
2 parents 0599cfa + 9aeb635 commit 4376f8d

File tree

14 files changed

+534
-331
lines changed

14 files changed

+534
-331
lines changed

compiler/src/jdk.internal.vm.compiler.test/src/org/graalvm/compiler/replacements/test/StringCompressInflateTest.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
import static org.junit.Assume.assumeTrue;
2828

2929
import java.io.UnsupportedEncodingException;
30+
import java.util.Arrays;
31+
32+
import org.junit.Assert;
3033

3134
import org.graalvm.compiler.core.common.CompilationIdentifier;
3235
import org.graalvm.compiler.nodes.Invoke;
@@ -295,6 +298,27 @@ public void testStringUTF16CompressCharByte() throws ClassNotFoundException {
295298
}
296299
}
297300
}
301+
302+
// Exhaustively check that compress returns the correct index of the non-latin1 char.
303+
final int size = 48;
304+
final byte fillByte = 'R';
305+
char[] chars = new char[size];
306+
final byte[] bytes = new byte[chars.length];
307+
Arrays.fill(bytes, fillByte);
308+
for (int i = 0; i < size; i++) { // Every starting index
309+
for (int j = i; j < size; j++) { // Every location of non-latin1
310+
Arrays.fill(chars, 'A');
311+
chars[j] = 0xFF21;
312+
byte[] dst = Arrays.copyOf(bytes, bytes.length);
313+
byte[] dst2 = Arrays.copyOf(bytes, bytes.length);
314+
int result = (int) invokeSafe(caller, null, chars, i, dst, 0, chars.length - i);
315+
int result2 = (int) executeVarargsSafe(code, chars, i, dst2, 0, chars.length - i);
316+
Assert.assertEquals(result, result2);
317+
Assert.assertArrayEquals(dst, dst2);
318+
Assert.assertEquals("compress found wrong index", j - i, result);
319+
Assert.assertEquals("extra character stored", fillByte, bytes[j]);
320+
}
321+
}
298322
}
299323

300324
public static void getCharsSnippet(String s, int srcBegin, int srcEnd, char[] dst, int dstBegin) {

compiler/src/jdk.internal.vm.compiler/src/org/graalvm/compiler/asm/aarch64/AArch64ASIMDAssembler.java

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,9 @@ public enum ASIMDInstruction {
704704
CMHS(UBit | 0b00111 << 11),
705705
USHL(UBit | 0b01000 << 11),
706706
UMAX(UBit | 0b01100 << 11),
707+
UMAXP(UBit | 0b10100 << 11),
707708
UMIN(UBit | 0b01101 << 11),
709+
UMINP(UBit | 0b10101 << 11),
708710
SUB(UBit | 0b10000 << 11),
709711
CMEQ(UBit | 0b10001 << 11),
710712
MLS(UBit | 0b10010 << 11),
@@ -3556,6 +3558,31 @@ public void umaxVVV(ASIMDSize size, ElementSize eSize, Register dst, Register sr
35563558
threeSameEncoding(ASIMDInstruction.UMAX, size, elemSizeXX(eSize), dst, src1, src2);
35573559
}
35583560

3561+
/**
3562+
* C7.2.361 Unsigned maximum pairwise.<br>
3563+
*
3564+
* <code>
3565+
* concat = src2:src1
3566+
* for i in 0..n-1 do dst[i] = uint_max(concat[2 * i], concat[2 * i + 1])
3567+
* </code>
3568+
*
3569+
* @param size register size.
3570+
* @param eSize element size.
3571+
* @param dst SIMD register.
3572+
* @param src1 SIMD register.
3573+
* @param src2 SIMD register.
3574+
*/
3575+
public void umaxpVVV(ASIMDSize size, ElementSize eSize, Register dst, Register src1, Register src2) {
3576+
assert usesMultipleLanes(size, eSize) : "Must use multiple lanes " + size + " " + eSize;
3577+
3578+
assert dst.getRegisterCategory().equals(SIMD) : dst;
3579+
assert src1.getRegisterCategory().equals(SIMD) : src1;
3580+
assert src2.getRegisterCategory().equals(SIMD) : src2;
3581+
assert eSize != ElementSize.DoubleWord : "Invalid lane width for umaxp";
3582+
3583+
threeSameEncoding(ASIMDInstruction.UMAXP, size, elemSizeXX(eSize), dst, src1, src2);
3584+
}
3585+
35593586
/**
35603587
* C7.2.362 Unsigned maximum across vector.<br>
35613588
*
@@ -3567,8 +3594,8 @@ public void umaxVVV(ASIMDSize size, ElementSize eSize, Register dst, Register sr
35673594
* @param src SIMD register.
35683595
*/
35693596
public void umaxvSV(ASIMDSize size, ElementSize elementSize, Register dst, Register src) {
3570-
assert dst.getRegisterCategory().equals(SIMD);
3571-
assert src.getRegisterCategory().equals(SIMD);
3597+
assert dst.getRegisterCategory().equals(SIMD) : dst;
3598+
assert src.getRegisterCategory().equals(SIMD) : src;
35723599
assert !(size == ASIMDSize.HalfReg && elementSize == ElementSize.Word) : "Invalid size and lane combination for umaxv";
35733600
assert elementSize != ElementSize.DoubleWord : "Invalid lane width for umaxv";
35743601

@@ -3597,6 +3624,31 @@ public void uminVVV(ASIMDSize size, ElementSize eSize, Register dst, Register sr
35973624
threeSameEncoding(ASIMDInstruction.UMIN, size, elemSizeXX(eSize), dst, src1, src2);
35983625
}
35993626

3627+
/**
3628+
* C7.2.364 Unsigned minimum pairwise.<br>
3629+
*
3630+
* <code>
3631+
* concat = src2:src1
3632+
* for i in 0..n-1 do dst[i] = uint_min(concat[2 * i], concat[2 * i + 1])
3633+
* </code>
3634+
*
3635+
* @param size register size.
3636+
* @param eSize element size.
3637+
* @param dst SIMD register.
3638+
* @param src1 SIMD register.
3639+
* @param src2 SIMD register.
3640+
*/
3641+
public void uminpVVV(ASIMDSize size, ElementSize eSize, Register dst, Register src1, Register src2) {
3642+
assert usesMultipleLanes(size, eSize) : "Must use multiple lanes " + size + " " + eSize;
3643+
3644+
assert dst.getRegisterCategory().equals(SIMD) : dst;
3645+
assert src1.getRegisterCategory().equals(SIMD) : src1;
3646+
assert src2.getRegisterCategory().equals(SIMD) : src2;
3647+
assert eSize != ElementSize.DoubleWord : "Invalid lane width for uminp";
3648+
3649+
threeSameEncoding(ASIMDInstruction.UMINP, size, elemSizeXX(eSize), dst, src1, src2);
3650+
}
3651+
36003652
/**
36013653
* C7.2.365 Unsigned minimum across vector.<br>
36023654
*

compiler/src/jdk.internal.vm.compiler/src/org/graalvm/compiler/lir/aarch64/AArch64ArrayCompareToOp.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,11 @@ private void emitSIMDCode(AArch64MacroAssembler masm, Label stringsEqualUptoLeng
333333
masm.neon.cmeqVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, eSize, array1HighV, array1HighV, array2HighV);
334334
masm.neon.andVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, tmpRegV1, array1LowV, array1HighV);
335335
masm.neon.uminvSV(AArch64ASIMDAssembler.ASIMDSize.FullReg, eSize, tmpRegV1, tmpRegV1);
336-
masm.fcmpZero(64, tmpRegV1);
337-
masm.branchConditionally(ConditionFlag.EQ, mismatchInChunk);
336+
try (AArch64MacroAssembler.ScratchRegister scratchReg = masm.getScratchRegister()) {
337+
Register tmp = scratchReg.getRegister();
338+
masm.neon.umovGX(AArch64ASIMDAssembler.ElementSize.DoubleWord, tmp, tmpRegV1, 0);
339+
masm.cbz(64, tmp, mismatchInChunk);
340+
}
338341
masm.cmp(64, array1, lastChunkAddress1);
339342
masm.branchConditionally(ConditionFlag.LO, simdLoop);
340343

compiler/src/jdk.internal.vm.compiler/src/org/graalvm/compiler/lir/aarch64/AArch64ArrayEqualsOp.java

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ private void emitArrayEquals(AArch64MacroAssembler asm,
231231
Register refAddress = len;
232232
asm.add(64, refAddress, arrayMax, len, ShiftType.LSL, strideMax.log2);
233233

234-
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
234+
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, tmp);
235235
asm.branchConditionally(ConditionFlag.NE, end);
236236

237237
asm.cmp(64, refAddress, arrayMax);
@@ -248,7 +248,7 @@ private void emitArrayEquals(AArch64MacroAssembler asm,
248248
// 64 byte loop
249249
asm.align(PREFERRED_LOOP_ALIGNMENT);
250250
asm.bind(vectorLoop);
251-
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
251+
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, tmp);
252252
asm.branchConditionally(ConditionFlag.NE, end);
253253
asm.cmp(64, arrayMax, refAddress);
254254
asm.branchConditionally(ConditionFlag.LO, vectorLoop);
@@ -262,13 +262,13 @@ private void emitArrayEquals(AArch64MacroAssembler asm,
262262
asm.sub(64, arrayM, arrayM, tmp, ShiftType.LSR, strideMax.log2 - strideM.log2);
263263
}
264264

265-
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
265+
simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, tmp);
266266
asm.jmp(end);
267267

268268
// tail for 32 - 63 bytes
269-
tail32(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, len, tailLessThan64, tailLessThan32, end);
269+
tail32(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, len, tmp, tailLessThan64, tailLessThan32, end);
270270
// tail for 16 - 31 bytes
271-
tail16(asm, strideA, strideB, strideM, strideMax, strideMin, arrayA, arrayB, arrayM, len, tailLessThan32, tailLessThan16, end);
271+
tail16(asm, strideA, strideB, strideM, strideMax, strideMin, arrayA, arrayB, arrayM, len, tmp, tailLessThan32, tailLessThan16, end);
272272
// tail for 8 - 15 bytes
273273
tailLessThan16(asm, strideA, strideB, strideM, strideMax, arrayA, arrayB, arrayM, len, tmp, ret, tailLessThan16, tailLessThan8, end, 8);
274274
// tail for 4 - 7 bytes
@@ -290,7 +290,8 @@ private void simdCompare64(AArch64MacroAssembler asm,
290290
Stride strideMask,
291291
Register arrayMax,
292292
Register arrayMin,
293-
Register arrayMask) {
293+
Register arrayMask,
294+
Register tmp) {
294295
ElementSize minESize = fromStride(strideMin);
295296
switch (strideMax.log2 - strideMin.log2) {
296297
case 0:
@@ -419,7 +420,7 @@ private void simdCompare64(AArch64MacroAssembler asm,
419420
default:
420421
throw GraalError.unimplemented("comparison of " + strideMin + " to " + strideMax + " not implemented"); // ExcludeFromJacocoGeneratedReport
421422
}
422-
vectorCheckZero(asm, v(0), v(0));
423+
cmpZeroVector(asm, v(0), v(0), tmp);
423424
}
424425

425426
private void tail32(AArch64MacroAssembler asm,
@@ -431,6 +432,7 @@ private void tail32(AArch64MacroAssembler asm,
431432
Register arrayMin,
432433
Register arrayMask,
433434
Register len,
435+
Register tmp,
434436
Label entry,
435437
Label nextTail,
436438
Label end) {
@@ -594,7 +596,7 @@ private void tail32(AArch64MacroAssembler asm,
594596
default:
595597
throw GraalError.unimplemented("comparison of " + strideMin + " to " + strideMax + " not implemented"); // ExcludeFromJacocoGeneratedReport
596598
}
597-
vectorCheckZero(asm, v(0), v(0));
599+
cmpZeroVector(asm, v(0), v(0), tmp);
598600
asm.jmp(end);
599601
}
600602

@@ -608,6 +610,7 @@ private void tail16(AArch64MacroAssembler asm,
608610
Register arrayB,
609611
Register arrayM,
610612
Register len,
613+
Register tmp,
611614
Label entry,
612615
Label nextTail,
613616
Label end) {
@@ -662,7 +665,7 @@ private void tail16(AArch64MacroAssembler asm,
662665
asm.neon.eorVVV(FullReg, vecArrayA2, vecArrayA2, vecArrayB2);
663666
asm.neon.orrVVV(FullReg, vecArrayA1, vecArrayA1, vecArrayA2);
664667

665-
vectorCheckZero(asm, vecArrayA1, vecArrayA1);
668+
cmpZeroVector(asm, vecArrayA1, vecArrayA1, tmp);
666669
asm.jmp(end);
667670
}
668671

@@ -752,7 +755,7 @@ private void tailLessThan16(AArch64MacroAssembler asm,
752755
asm.neon.orrVVV(FullReg, vecArrayA1, vecArrayA1, vecArrayM1);
753756
}
754757
asm.neon.eorVVV(FullReg, vecArrayA1, vecArrayA1, vecArrayB1);
755-
vectorCheckZero(asm, vecArrayA1, vecArrayA1);
758+
cmpZeroVector(asm, vecArrayA1, vecArrayA1, tmp);
756759
} else if (strideMax.value == nBytes) {
757760
asm.bind(entry);
758761
// tail for length == 1

compiler/src/jdk.internal.vm.compiler/src/org/graalvm/compiler/lir/aarch64/AArch64ArrayIndexOfOp.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -631,9 +631,11 @@ private void emitSIMDMatch(AArch64MacroAssembler masm,
631631
break;
632632
}
633633
masm.neon.orrVVV(FullReg, vecTmp[0], vecArray1, vecArray2);
634-
/* If value != 0, then there was a match somewhere. */
635-
vectorCheckZero(masm, ElementSize.fromStride(getMatchResultStride()), vecTmp[0], vecTmp[0], variant != ArrayIndexOfVariant.Table);
636-
masm.branchConditionally(ConditionFlag.NE, matchInChunk);
634+
try (ScratchRegister sc = masm.getScratchRegister()) {
635+
Register tmp = sc.getRegister();
636+
/* If value != 0, then there was a match somewhere. */
637+
cbnzVector(masm, ElementSize.fromStride(getMatchResultStride()), vecTmp[0], vecTmp[0], tmp, variant != ArrayIndexOfVariant.Table, matchInChunk);
638+
}
637639
}
638640

639641
private Stride getMatchResultStride() {

compiler/src/jdk.internal.vm.compiler/src/org/graalvm/compiler/lir/aarch64/AArch64ArrayRegionCompareToOp.java

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,7 @@ private void emitArrayCompare(CompilationResultBuilder crb, AArch64MacroAssemble
202202
asm.neon.eorVVV(FullReg, vecTmp1, vecArrayA1, vecArrayB1);
203203
asm.neon.eorVVV(FullReg, vecTmp2, vecArrayA2, vecArrayB2);
204204
asm.neon.orrVVV(FullReg, vecTmp2, vecTmp2, vecTmp1);
205-
vectorCheckZero(asm, vecTmp2, vecTmp2);
206-
asm.branchConditionally(ConditionFlag.NE, diffFound);
205+
cbnzVector(asm, ElementSize.Byte, vecTmp2, vecTmp2, tmp, false, diffFound);
207206
// if so, continue
208207
asm.cmp(64, maxStrideArray, refAddress);
209208
asm.branchConditionally(ConditionFlag.LO, vectorLoop);
@@ -218,8 +217,7 @@ private void emitArrayCompare(CompilationResultBuilder crb, AArch64MacroAssemble
218217
asm.neon.eorVVV(FullReg, vecTmp1, vecArrayA1, vecArrayB1);
219218
asm.neon.eorVVV(FullReg, vecTmp2, vecArrayA2, vecArrayB2);
220219
asm.neon.orrVVV(FullReg, vecTmp2, vecTmp2, vecTmp1);
221-
vectorCheckZero(asm, vecTmp2, vecTmp2);
222-
asm.branchConditionally(ConditionFlag.NE, diffFound);
220+
cbnzVector(asm, ElementSize.Byte, vecTmp2, vecTmp2, tmp, false, diffFound);
223221
asm.mov(64, ret, zr);
224222
asm.jmp(end);
225223

@@ -239,8 +237,7 @@ private void emitArrayCompare(CompilationResultBuilder crb, AArch64MacroAssemble
239237
asm.align(PREFERRED_BRANCH_TARGET_ALIGNMENT);
240238
asm.bind(diffFound);
241239
// check if vecArrayA1 and vecArrayB1 are equal
242-
vectorCheckZero(asm, vecTmp1, vecTmp1);
243-
asm.branchConditionally(ConditionFlag.NE, returnV1);
240+
cbnzVector(asm, ElementSize.Byte, vecTmp1, vecTmp1, tmp, false, returnV1);
244241
calcReturnValue(asm, ret, vecArrayA2, vecArrayB2, vecArrayA1, vecArrayB1, vecMask, strideMax);
245242
asm.jmp(end);
246243

@@ -295,11 +292,9 @@ private static void loadAndExtend(AArch64MacroAssembler asm, Stride strideDst, S
295292
private static void calcReturnValue(AArch64MacroAssembler asm, Register ret, Register vecArrayA, Register vecArrayB, Register vecTmp, Register vecIndex, Register vecMask, Stride strideMax) {
296293
// set all equal bytes to 0xff, others to 0x00
297294
asm.neon.cmeqVVV(FullReg, fromStride(strideMax), vecTmp, vecArrayA, vecArrayB);
298-
// BIC with the ascending index mask, this will replace all non-equal bytes with their
299-
// corresponding byte index
300-
asm.neon.bicVVV(FullReg, vecIndex, vecMask, vecTmp);
301-
// OR with the result of CMEQ, replacing all equal bytes with 0xff again
302-
asm.neon.orrVVV(FullReg, vecIndex, vecIndex, vecTmp);
295+
// OR with the ascending index mask, this will replace all non-equal bytes with their
296+
// corresponding byte index, and all equal bytes with 0xff
297+
asm.neon.orrVVV(FullReg, vecIndex, vecMask, vecTmp);
303298
// Get the unsigned minimum. This will yield the index of the first non-equal bytes, since
304299
// all equal ones are filled with 0xff
305300
asm.neon.uminvSV(FullReg, fromStride(strideMax), vecIndex, vecIndex);

0 commit comments

Comments
 (0)