|
33 | 33 | import java.util.concurrent.ThreadLocalRandom; |
34 | 34 | import java.util.stream.IntStream; |
35 | 35 |
|
| 36 | +import static io.airlift.slice.SliceUtf8.codePointByteLengths; |
36 | 37 | import static io.airlift.slice.SliceUtf8.codePointToUtf8; |
37 | 38 | import static io.airlift.slice.SliceUtf8.compareUtf16BE; |
38 | 39 | import static io.airlift.slice.SliceUtf8.countCodePoints; |
@@ -339,6 +340,125 @@ else if ((currentChar == '%') || (currentChar == '_')) { |
339 | 340 | return position; |
340 | 341 | } |
341 | 342 |
|
| 343 | + @Benchmark |
| 344 | + public int benchmarkTrinoPadStringCodePointLengths(TrinoPadData data) |
| 345 | + { |
| 346 | + Slice padString = data.getPadString(); |
| 347 | + int padStringLength = countCodePoints(padString); |
| 348 | + int[] padStringCounts = new int[padStringLength]; |
| 349 | + for (int index = 0; index < padStringLength; index++) { |
| 350 | + padStringCounts[index] = lengthOfCodePointSafe(padString, offsetOfCodePoint(padString, index)); |
| 351 | + } |
| 352 | + return checksum(padStringCounts); |
| 353 | + } |
| 354 | + |
| 355 | + @Benchmark |
| 356 | + public int benchmarkTrinoPadStringCodePointLengthsSinglePass(TrinoPadData data) |
| 357 | + { |
| 358 | + Slice padString = data.getPadString(); |
| 359 | + int[] padStringCounts = new int[countCodePoints(padString)]; |
| 360 | + int position = 0; |
| 361 | + int index = 0; |
| 362 | + while (position < padString.length()) { |
| 363 | + int codePoint = getCodePointAt(padString, position); |
| 364 | + int codePointLength = lengthOfCodePoint(codePoint); |
| 365 | + padStringCounts[index] = codePointLength; |
| 366 | + index++; |
| 367 | + position += codePointLength; |
| 368 | + } |
| 369 | + if (index != padStringCounts.length) { |
| 370 | + throw new AssertionError(); |
| 371 | + } |
| 372 | + return checksum(padStringCounts); |
| 373 | + } |
| 374 | + |
| 375 | + @Benchmark |
| 376 | + public int benchmarkTrinoPadStringCodePointLengthsByteArray(TrinoPadData data) |
| 377 | + { |
| 378 | + byte[] utf8 = data.getUtf8(); |
| 379 | + int baseOffset = data.getOffset(); |
| 380 | + int byteLength = data.getByteLength(); |
| 381 | + int[] padStringCounts = new int[countCodePoints(utf8, baseOffset, byteLength)]; |
| 382 | + int position = 0; |
| 383 | + int index = 0; |
| 384 | + while (position < byteLength) { |
| 385 | + int codePoint = getCodePointAt(utf8, baseOffset, byteLength, position); |
| 386 | + int codePointLength = lengthOfCodePoint(codePoint); |
| 387 | + padStringCounts[index] = codePointLength; |
| 388 | + index++; |
| 389 | + position += codePointLength; |
| 390 | + } |
| 391 | + if (index != padStringCounts.length) { |
| 392 | + throw new AssertionError(); |
| 393 | + } |
| 394 | + return checksum(padStringCounts); |
| 395 | + } |
| 396 | + |
| 397 | + @Benchmark |
| 398 | + public int benchmarkTrinoPadStringCodePointLengthsSliceUtf8Helper(TrinoPadData data) |
| 399 | + { |
| 400 | + return checksum(codePointByteLengths(data.getPadString())); |
| 401 | + } |
| 402 | + |
| 403 | + @Benchmark |
| 404 | + public int benchmarkTrinoPadStringCodePointLengthsSliceUtf8HelperByteArray(TrinoPadData data) |
| 405 | + { |
| 406 | + return checksum(codePointByteLengths(data.getUtf8(), data.getOffset(), data.getByteLength())); |
| 407 | + } |
| 408 | + |
| 409 | + @Benchmark |
| 410 | + public Slice benchmarkTrinoDomainTranslatorPrefixRange(TrinoPrefixRangeData data) |
| 411 | + { |
| 412 | + Slice constantPrefix = data.getConstantPrefix(); |
| 413 | + |
| 414 | + int lastIncrementable = -1; |
| 415 | + for (int position = 0; position < constantPrefix.length(); position += lengthOfCodePoint(constantPrefix, position)) { |
| 416 | + if (getCodePointAt(constantPrefix, position) < 127) { |
| 417 | + lastIncrementable = position; |
| 418 | + } |
| 419 | + } |
| 420 | + |
| 421 | + if (lastIncrementable == -1) { |
| 422 | + return Slices.EMPTY_SLICE; |
| 423 | + } |
| 424 | + |
| 425 | + Slice upperBound = constantPrefix.slice(0, lastIncrementable + lengthOfCodePoint(constantPrefix, lastIncrementable)).copy(); |
| 426 | + setCodePointAt(getCodePointAt(constantPrefix, lastIncrementable) + 1, upperBound, lastIncrementable); |
| 427 | + return upperBound; |
| 428 | + } |
| 429 | + |
| 430 | + @Benchmark |
| 431 | + public Slice benchmarkTrinoDomainTranslatorPrefixRangeSingleDecode(TrinoPrefixRangeData data) |
| 432 | + { |
| 433 | + byte[] utf8 = data.getUtf8(); |
| 434 | + int baseOffset = data.getOffset(); |
| 435 | + int byteLength = data.getByteLength(); |
| 436 | + Slice constantPrefix = data.getConstantPrefix(); |
| 437 | + |
| 438 | + int lastIncrementableOffset = -1; |
| 439 | + int lastIncrementableCodePoint = -1; |
| 440 | + int lastIncrementableLength = 0; |
| 441 | + int position = 0; |
| 442 | + while (position < byteLength) { |
| 443 | + int codePoint = getCodePointAt(utf8, baseOffset, byteLength, position); |
| 444 | + int codePointLength = lengthOfCodePoint(codePoint); |
| 445 | + if (codePoint < 127) { |
| 446 | + lastIncrementableOffset = position; |
| 447 | + lastIncrementableCodePoint = codePoint; |
| 448 | + lastIncrementableLength = codePointLength; |
| 449 | + } |
| 450 | + position += codePointLength; |
| 451 | + } |
| 452 | + |
| 453 | + if (lastIncrementableOffset == -1) { |
| 454 | + return Slices.EMPTY_SLICE; |
| 455 | + } |
| 456 | + |
| 457 | + Slice upperBound = constantPrefix.slice(0, lastIncrementableOffset + lastIncrementableLength).copy(); |
| 458 | + setCodePointAt(lastIncrementableCodePoint + 1, upperBound, lastIncrementableOffset); |
| 459 | + return upperBound; |
| 460 | + } |
| 461 | + |
342 | 462 | @Benchmark |
343 | 463 | public int benchmarkCompareUtf16BE(CompareData data) |
344 | 464 | { |
@@ -452,6 +572,24 @@ public int benchmarkCodePointToUtf8(CodePointWriteData data) |
452 | 572 | return totalBytes; |
453 | 573 | } |
454 | 574 |
|
| 575 | + private static int checksum(int[] values) |
| 576 | + { |
| 577 | + int checksum = 1; |
| 578 | + for (int value : values) { |
| 579 | + checksum = (31 * checksum) ^ value; |
| 580 | + } |
| 581 | + return checksum; |
| 582 | + } |
| 583 | + |
| 584 | + private static int checksum(byte[] values) |
| 585 | + { |
| 586 | + int checksum = 1; |
| 587 | + for (byte value : values) { |
| 588 | + checksum = (31 * checksum) ^ value; |
| 589 | + } |
| 590 | + return checksum; |
| 591 | + } |
| 592 | + |
455 | 593 | @State(Thread) |
456 | 594 | public static class BenchmarkData |
457 | 595 | { |
@@ -814,6 +952,120 @@ public int getEscapeChar() |
814 | 952 | } |
815 | 953 | } |
816 | 954 |
|
| 955 | + @State(Thread) |
| 956 | + public static class TrinoPadData |
| 957 | + { |
| 958 | + @Param("128") |
| 959 | + private int length; |
| 960 | + |
| 961 | + @Param({"true", "false"}) |
| 962 | + private boolean ascii; |
| 963 | + |
| 964 | + private byte[] utf8; |
| 965 | + private int offset; |
| 966 | + private int byteLength; |
| 967 | + private Slice padString; |
| 968 | + |
| 969 | + @Setup |
| 970 | + public void setup() |
| 971 | + { |
| 972 | + int[] codePointSet = ascii ? BenchmarkData.ASCII_CODE_POINTS : BenchmarkData.ALL_CODE_POINTS; |
| 973 | + ThreadLocalRandom random = ThreadLocalRandom.current(); |
| 974 | + DynamicSliceOutput out = new DynamicSliceOutput(length * 4); |
| 975 | + for (int index = 0; index < length; index++) { |
| 976 | + int codePoint = codePointSet[random.nextInt(codePointSet.length)]; |
| 977 | + out.appendBytes(new String(Character.toChars(codePoint)).getBytes(StandardCharsets.UTF_8)); |
| 978 | + } |
| 979 | + |
| 980 | + byte[] encoded = out.slice().getBytes(); |
| 981 | + offset = 9; |
| 982 | + utf8 = new byte[offset + encoded.length + 3]; |
| 983 | + System.arraycopy(encoded, 0, utf8, offset, encoded.length); |
| 984 | + byteLength = encoded.length; |
| 985 | + padString = Slices.wrappedBuffer(utf8, offset, byteLength); |
| 986 | + } |
| 987 | + |
| 988 | + public byte[] getUtf8() |
| 989 | + { |
| 990 | + return utf8; |
| 991 | + } |
| 992 | + |
| 993 | + public int getOffset() |
| 994 | + { |
| 995 | + return offset; |
| 996 | + } |
| 997 | + |
| 998 | + public int getByteLength() |
| 999 | + { |
| 1000 | + return byteLength; |
| 1001 | + } |
| 1002 | + |
| 1003 | + public Slice getPadString() |
| 1004 | + { |
| 1005 | + return padString; |
| 1006 | + } |
| 1007 | + } |
| 1008 | + |
| 1009 | + @State(Thread) |
| 1010 | + public static class TrinoPrefixRangeData |
| 1011 | + { |
| 1012 | + @Param("256") |
| 1013 | + private int length; |
| 1014 | + |
| 1015 | + @Param({"true", "false"}) |
| 1016 | + private boolean ascii; |
| 1017 | + |
| 1018 | + private byte[] utf8; |
| 1019 | + private int offset; |
| 1020 | + private int byteLength; |
| 1021 | + private Slice constantPrefix; |
| 1022 | + |
| 1023 | + @Setup |
| 1024 | + public void setup() |
| 1025 | + { |
| 1026 | + int[] codePointSet = ascii ? BenchmarkData.ASCII_CODE_POINTS : BenchmarkData.ALL_CODE_POINTS; |
| 1027 | + ThreadLocalRandom random = ThreadLocalRandom.current(); |
| 1028 | + |
| 1029 | + int[] codePoints = new int[length]; |
| 1030 | + codePoints[0] = 'a'; |
| 1031 | + for (int index = 1; index < codePoints.length; index++) { |
| 1032 | + codePoints[index] = codePointSet[random.nextInt(codePointSet.length)]; |
| 1033 | + } |
| 1034 | + |
| 1035 | + DynamicSliceOutput out = new DynamicSliceOutput(length * 4); |
| 1036 | + for (int codePoint : codePoints) { |
| 1037 | + out.appendBytes(new String(Character.toChars(codePoint)).getBytes(StandardCharsets.UTF_8)); |
| 1038 | + } |
| 1039 | + |
| 1040 | + byte[] encoded = out.slice().getBytes(); |
| 1041 | + offset = 13; |
| 1042 | + utf8 = new byte[offset + encoded.length + 5]; |
| 1043 | + System.arraycopy(encoded, 0, utf8, offset, encoded.length); |
| 1044 | + byteLength = encoded.length; |
| 1045 | + constantPrefix = Slices.wrappedBuffer(utf8, offset, byteLength); |
| 1046 | + } |
| 1047 | + |
| 1048 | + public byte[] getUtf8() |
| 1049 | + { |
| 1050 | + return utf8; |
| 1051 | + } |
| 1052 | + |
| 1053 | + public int getOffset() |
| 1054 | + { |
| 1055 | + return offset; |
| 1056 | + } |
| 1057 | + |
| 1058 | + public int getByteLength() |
| 1059 | + { |
| 1060 | + return byteLength; |
| 1061 | + } |
| 1062 | + |
| 1063 | + public Slice getConstantPrefix() |
| 1064 | + { |
| 1065 | + return constantPrefix; |
| 1066 | + } |
| 1067 | + } |
| 1068 | + |
817 | 1069 | @State(Thread) |
818 | 1070 | public static class CodePointWriteData |
819 | 1071 | { |
|
0 commit comments