 import jdk.vm.ci.code.Register;
 import jdk.vm.ci.meta.AllocatableValue;
 import jdk.vm.ci.meta.JavaConstant;
+import jdk.vm.ci.meta.PlatformKind;
 import jdk.vm.ci.meta.Value;
 
 public class AMD64VectorShuffle {

@@ -360,6 +361,132 @@ private void emitBytePermute(CompilationResultBuilder crb, AMD64MacroAssembler m
         }
     }
 
+    /**
+     * A slice operation, see {@link jdk.graal.compiler.vector.nodes.amd64.AMD64SimdSliceNode}.
+     */
+    public static final class SliceOp extends AMD64LIRInstruction {
+        public static final LIRInstructionClass<SliceOp> TYPE = LIRInstructionClass.create(SliceOp.class);
+
+        @Def({OperandFlag.REG}) protected AllocatableValue result;
+        @Alive({OperandFlag.REG}) protected AllocatableValue src1;
+        @Alive({OperandFlag.REG}) protected AllocatableValue src2;
+        @Temp({OperandFlag.REG, OperandFlag.ILLEGAL}) protected AllocatableValue tmp1;
+        @Temp({OperandFlag.REG, OperandFlag.ILLEGAL}) protected AllocatableValue tmp2;
+        private final int originInBytes;
+        private final AMD64SIMDInstructionEncoding encoding;
+
+        public SliceOp(AMD64LIRGenerator gen, AllocatableValue result, AllocatableValue src1, AllocatableValue src2, int origin, AMD64SIMDInstructionEncoding encoding) {
+            super(TYPE);
+            AMD64Kind eKind = ((AMD64Kind) result.getPlatformKind()).getScalar();
+            this.result = result;
+            this.src1 = src1;
+            this.src2 = src2;
+            this.originInBytes = origin * eKind.getSizeInBytes();
+            this.encoding = encoding;
+            allocateTempIfNecessary(gen);
+        }
+
+        @Override
+        public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
+            int resultSize = result.getPlatformKind().getSizeInBytes();
+            switch (resultSize) {
+                case 4 -> {
+                    if (src1.equals(src2) && originInBytes == 2) {
+                        VexRMIOp.VPSHUFLW.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src1), 0x1);
+                    } else {
+                        VexRMIOp.VPSHUFD.encoding(encoding).emit(masm, XMM, asRegister(tmp1), asRegister(src1), 0);
+                        VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src2), asRegister(tmp1), originInBytes + 12);
+                    }
+                }
+                case 8 -> {
+                    if (src1.equals(src2) && originInBytes % 2 == 0) {
+                        int imm;
+                        if (originInBytes == 2) {
+                            imm = 0b00111001;
+                        } else if (originInBytes == 4) {
+                            imm = 0b01001110;
+                        } else {
+                            GraalError.guarantee(originInBytes == 6, "unexpected originInBytes %d", originInBytes);
+                            imm = 0b10010011;
+                        }
+                        VexRMIOp.VPSHUFLW.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src1), imm);
+                    } else {
+                        VexRMIOp.VPSHUFD.encoding(encoding).emit(masm, XMM, asRegister(tmp1), asRegister(src1), 0x40);
+                        VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src2), asRegister(tmp1), originInBytes + 8);
+                    }
+                }
+                case 16 -> VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src2), asRegister(src1), originInBytes);
+                case 32 -> {
+                    if (encoding == AMD64SIMDInstructionEncoding.VEX || originInBytes % Integer.BYTES != 0) {
+                        Register tmp = originInBytes == 16 ? asRegister(result) : asRegister(tmp1);
+                        if (encoding == AMD64SIMDInstructionEncoding.VEX) {
+                            VexRVMIOp.VPERM2I128.emit(masm, YMM, tmp, asRegister(src1), asRegister(src2), 0x21);
+                        } else {
+                            VexRVMIOp.EVALIGND.emit(masm, YMM, tmp, asRegister(src2), asRegister(src1), 4);
+                        }
+                        if (originInBytes < 16) {
+                            VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, YMM, asRegister(result), asRegister(tmp1), asRegister(src1), originInBytes);
+                        } else if (originInBytes > 16) {
+                            VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, YMM, asRegister(result), asRegister(src2), asRegister(tmp1), originInBytes - 16);
+                        }
+                    } else {
+                        VexRVMIOp.EVALIGND.emit(masm, YMM, asRegister(result), asRegister(src2), asRegister(src1), originInBytes / Integer.BYTES);
+                    }
+                }
+                case 64 -> {
+                    GraalError.guarantee(encoding == AMD64SIMDInstructionEncoding.EVEX, "unexpected encoding with 512-bit vector");
+                    if (originInBytes % 4 != 0) {
+                        if (originInBytes < 16) {
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp1), asRegister(src2), asRegister(src1), 4);
+                            VexRVMIOp.EVPALIGNR.emit(masm, ZMM, asRegister(result), asRegister(tmp1), asRegister(src1), originInBytes);
+                        } else if (originInBytes < 32) {
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp1), asRegister(src2), asRegister(src1), 4);
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp2), asRegister(src2), asRegister(src1), 8);
+                            VexRVMIOp.EVPALIGNR.emit(masm, ZMM, asRegister(result), asRegister(tmp2), asRegister(tmp1), originInBytes - 16);
+                        } else if (originInBytes < 48) {
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp1), asRegister(src2), asRegister(src1), 8);
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp2), asRegister(src2), asRegister(src1), 12);
+                            VexRVMIOp.EVPALIGNR.emit(masm, ZMM, asRegister(result), asRegister(tmp2), asRegister(tmp1), originInBytes - 32);
+                        } else {
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp1), asRegister(src2), asRegister(src1), 12);
+                            VexRVMIOp.EVPALIGNR.emit(masm, ZMM, asRegister(result), asRegister(src2), asRegister(tmp1), originInBytes - 48);
+                        }
+                    } else {
+                        VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(result), asRegister(src2), asRegister(src1), originInBytes / Integer.BYTES);
+                    }
+                }
+                default -> GraalError.shouldNotReachHereUnexpectedValue(resultSize);
+            }
+        }
+
+        private void allocateTempIfNecessary(AMD64LIRGenerator gen) {
+            PlatformKind resultKind = result.getPlatformKind();
+            boolean needsTemp;
+            if (resultKind.getSizeInBytes() < XMM.getBytes()) {
+                needsTemp = !src1.equals(src2) || originInBytes % 2 != 0;
+            } else if (resultKind.getSizeInBytes() == XMM.getBytes()) {
+                needsTemp = false;
+            } else if (encoding == AMD64SIMDInstructionEncoding.VEX) {
+                needsTemp = true;
+            } else {
+                needsTemp = (originInBytes % Integer.BYTES != 0);
+            }
+            if (needsTemp) {
+                tmp1 = gen.newVariable(LIRKind.value(resultKind));
+            } else {
+                tmp1 = Value.ILLEGAL;
+            }
+
+            if (resultKind.getSizeInBytes() == ZMM.getBytes() && originInBytes % Integer.BYTES != 0 &&
+                            originInBytes > 16 && originInBytes < 48) {
+                GraalError.guarantee(!tmp1.equals(Value.ILLEGAL), "must have tmp1 with originInBytes = %d", originInBytes);
+                tmp2 = gen.newVariable(LIRKind.value(resultKind));
+            } else {
+                tmp2 = Value.ILLEGAL;
+            }
+        }
+    }
+
     public static final class IntToVectorOp extends AMD64LIRInstruction {
         public static final LIRInstructionClass<IntToVectorOp> TYPE = LIRInstructionClass.create(IntToVectorOp.class);
 
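To make the intent of the new instruction easier to follow, here is a small, self-contained Java sketch of the semantics SliceOp appears to implement and of the 128-bit VPALIGNR primitive that the single-instruction 16-byte case relies on. The class and method names (SliceReference, slice, palignr128) and the plain byte-array representation of vectors are illustrative assumptions, not part of this change.

// Illustrative reference model only; not part of the patch.
public final class SliceReference {

    /**
     * result[i] = (src1 ++ src2)[originInBytes + i], with src1 supplying the low
     * (first) elements of the concatenation. This is the slice semantics SliceOp
     * is assumed to implement, expressed on plain byte arrays.
     */
    static byte[] slice(byte[] src1, byte[] src2, int originInBytes) {
        assert src1.length == src2.length;
        byte[] result = new byte[src1.length];
        for (int i = 0; i < result.length; i++) {
            int j = originInBytes + i;
            result[i] = j < src1.length ? src1[j] : src2[j - src1.length];
        }
        return result;
    }

    /**
     * Model of one 128-bit (V)PALIGNR: the two sources are concatenated with
     * {@code high} above {@code low}, the composite is shifted right by
     * {@code imm} bytes (zero-filled), and the low 16 bytes are kept. For
     * 16-byte vectors, palignr128(src2, src1, originInBytes) equals
     * slice(src1, src2, originInBytes), which is why the case 16 branch above
     * needs only a single VPALIGNR.
     */
    static byte[] palignr128(byte[] high, byte[] low, int imm) {
        byte[] composite = new byte[32];
        System.arraycopy(low, 0, composite, 0, 16);   // bytes 0..15 come from the second source
        System.arraycopy(high, 0, composite, 16, 16); // bytes 16..31 come from the first source
        byte[] result = new byte[16];
        for (int i = 0; i < 16; i++) {
            int j = imm + i;
            result[i] = j < 32 ? composite[j] : 0;
        }
        return result;
    }
}

On 256-bit and 512-bit registers VPALIGNR rotates each 128-bit lane independently, which is why the case 32 and case 64 branches first build lane-shifted views of the src2:src1 concatenation (with VPERM2I128 under VEX, or VALIGND under EVEX) before applying VPALIGNR, and why a single cross-lane VALIGND suffices whenever originInBytes is a multiple of four.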