Skip to content

Commit a4320bc

Browse files
authored
fix(java): use little endian for utf16 string on big endian (#3159)
## Why?

## What does this PR do?

## Related issues

Closes #2440

## Does this PR introduce any user-facing change?

- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?

## Benchmark
1 parent 47a73a1 commit a4320bc

File tree

1 file changed

+111
-33
lines changed

1 file changed

+111
-33
lines changed

java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java

Lines changed: 111 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,12 @@ public String readBytesString(MemoryBuffer buffer) {
196196
long header = buffer.readVarUint36Small();
197197
byte coder = (byte) (header & 0b11);
198198
int numBytes = (int) (header >>> 2);
199-
byte[] bytes = readBytesUnCompressedUTF16(buffer, numBytes);
199+
byte[] bytes;
200+
if (!Platform.IS_LITTLE_ENDIAN && coder == UTF16) {
201+
bytes = readBytesUTF16BE(buffer, numBytes);
202+
} else {
203+
bytes = readBytesUnCompressedUTF16(buffer, numBytes);
204+
}
200205
if (coder != UTF8) {
201206
return newBytesStringZeroCopy(coder, bytes);
202207
} else {
@@ -236,8 +241,16 @@ public String readCompressedBytesString(MemoryBuffer buffer) {
236241
data = readBytesUTF8(buffer, numBytes);
237242
}
238243
return newBytesStringZeroCopy(UTF16, data);
239-
} else if (coder == LATIN1 || coder == UTF16) {
244+
} else if (coder == LATIN1) {
240245
return newBytesStringZeroCopy(coder, readBytesUnCompressedUTF16(buffer, numBytes));
246+
} else if (coder == UTF16) {
247+
byte[] bytes;
248+
if (Platform.IS_LITTLE_ENDIAN) {
249+
bytes = readBytesUnCompressedUTF16(buffer, numBytes);
250+
} else {
251+
bytes = readBytesUTF16BE(buffer, numBytes);
252+
}
253+
return newBytesStringZeroCopy(coder, bytes);
241254
} else {
242255
throw new RuntimeException("Unknown coder type " + coder);
243256
}
@@ -398,6 +411,10 @@ public static void writeBytesString(MemoryBuffer buffer, String value) {
398411
}
399412

400413
public static void writeBytesString(MemoryBuffer buffer, byte coder, byte[] bytes) {
414+
if (!Platform.IS_LITTLE_ENDIAN && coder == UTF16) {
415+
writeBytesStringUTF16BE(buffer, bytes);
416+
return;
417+
}
401418
int bytesLen = bytes.length;
402419
long header = ((long) bytesLen << 2) | coder;
403420
int writerIndex = buffer.writerIndex();
@@ -507,37 +524,14 @@ public byte[] readBytesUnCompressedUTF16(MemoryBuffer buffer, int numBytes) {
507524
}
508525

509526
public char[] readCharsUTF16(MemoryBuffer buffer, int numBytes) {
510-
char[] chars = new char[numBytes >> 1];
511527
if (Platform.IS_LITTLE_ENDIAN) {
528+
char[] chars = new char[numBytes >> 1];
512529
// FIXME JDK11 utf16 string uses little-endian order.
513530
buffer.readChars(chars, Platform.CHAR_ARRAY_OFFSET, numBytes);
531+
return chars;
514532
} else {
515-
buffer.checkReadableBytes(numBytes);
516-
final byte[] targetArray = buffer.getHeapMemory();
517-
if (targetArray != null) {
518-
int charIndex = 0;
519-
for (int i = buffer._unsafeHeapReaderIndex(), end = i + numBytes; i < end; i += 2) {
520-
char c =
521-
(char)
522-
((targetArray[i] & 0xff << StringUTF16.HI_BYTE_SHIFT)
523-
| ((targetArray[i + 1] & 0xff) << StringUTF16.LO_BYTE_SHIFT));
524-
chars[charIndex++] = c;
525-
}
526-
buffer._increaseReaderIndexUnsafe(numBytes);
527-
} else {
528-
final byte[] tmpArray = getByteArray(numBytes);
529-
buffer.readBytes(tmpArray, 0, numBytes);
530-
int charIndex = 0;
531-
for (int i = 0; i < numBytes; i += 2) {
532-
char c =
533-
(char)
534-
((tmpArray[i] & 0xff << StringUTF16.HI_BYTE_SHIFT)
535-
| ((tmpArray[i + 1] & 0xff) << StringUTF16.LO_BYTE_SHIFT));
536-
chars[charIndex++] = c;
537-
}
538-
}
533+
return readCharsUTF16BE(buffer, numBytes);
539534
}
540-
return chars;
541535
}
542536

543537
public String readCharsUTF8(MemoryBuffer buffer, int numBytes) {
@@ -623,11 +617,15 @@ public void writeCharsUTF16(MemoryBuffer buffer, char[] chars, int numChars) {
623617
Platform.BYTE_ARRAY_OFFSET + arrIndex,
624618
numBytes);
625619
} else {
626-
heapWriteCharsUTF16BE(chars, arrIndex, numBytes, targetArray);
620+
writeCharsUTF16BEToHeap(chars, arrIndex, numBytes, targetArray);
627621
}
628622
} else {
629623
writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
630-
writerIndex = offHeapWriteCharsUTF16(buffer, chars, writerIndex, numBytes);
624+
if (Platform.IS_LITTLE_ENDIAN) {
625+
writerIndex = offHeapWriteCharsUTF16(buffer, chars, writerIndex, numBytes);
626+
} else {
627+
writerIndex = offHeapWriteCharsUTF16BE(buffer, chars, writerIndex, numBytes);
628+
}
631629
}
632630
buffer._unsafeWriterIndex(writerIndex);
633631
}
@@ -909,14 +907,14 @@ private static MethodHandle getJavaStringZeroCopyCtrHandle() {
909907
}
910908
}
911909

912-
private static void heapWriteCharsUTF16BE(
910+
private static void writeCharsUTF16BEToHeap(
913911
char[] chars, int arrIndex, int numBytes, byte[] targetArray) {
914912
// Write to heap memory then copy is 250% faster than unsafe write to direct memory.
915913
int charIndex = 0;
916914
for (int i = arrIndex, end = i + numBytes; i < end; i += 2) {
917915
char c = chars[charIndex++];
918-
targetArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT);
919-
targetArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT);
916+
targetArray[i] = (byte) c;
917+
targetArray[i + 1] = (byte) (c >>> 8);
920918
}
921919
}
922920

@@ -934,6 +932,86 @@ private int offHeapWriteCharsUTF16(
934932
return writerIndex;
935933
}
936934

935+
private int offHeapWriteCharsUTF16BE(
936+
MemoryBuffer buffer, char[] chars, int writerIndex, int numBytes) {
937+
byte[] tmpArray = getByteArray(numBytes);
938+
int charIndex = 0;
939+
for (int i = 0; i < numBytes; i += 2) {
940+
char c = chars[charIndex++];
941+
tmpArray[i] = (byte) c;
942+
tmpArray[i + 1] = (byte) (c >>> 8);
943+
}
944+
buffer.put(writerIndex, tmpArray, 0, numBytes);
945+
writerIndex += numBytes;
946+
return writerIndex;
947+
}
948+
949+
private char[] readCharsUTF16BE(MemoryBuffer buffer, int numBytes) {
950+
buffer.checkReadableBytes(numBytes);
951+
final byte[] targetArray = buffer.getHeapMemory();
952+
char[] chars = new char[numBytes >> 1];
953+
if (targetArray != null) {
954+
int charIndex = 0;
955+
for (int i = buffer._unsafeHeapReaderIndex(), end = i + numBytes; i < end; i += 2) {
956+
int lo = targetArray[i] & 0xff;
957+
int hi = targetArray[i + 1] & 0xff;
958+
chars[charIndex++] = (char) (lo | (hi << 8));
959+
}
960+
buffer._increaseReaderIndexUnsafe(numBytes);
961+
} else {
962+
final byte[] tmpArray = getByteArray(numBytes);
963+
buffer.readBytes(tmpArray, 0, numBytes);
964+
int charIndex = 0;
965+
for (int i = 0; i < numBytes; i += 2) {
966+
int lo = tmpArray[i] & 0xff;
967+
int hi = tmpArray[i + 1] & 0xff;
968+
chars[charIndex++] = (char) (lo | (hi << 8));
969+
}
970+
}
971+
return chars;
972+
}
973+
974+
private byte[] readBytesUTF16BE(MemoryBuffer buffer, int numBytes) {
975+
byte[] bytes = readBytesUnCompressedUTF16(buffer, numBytes);
976+
swapUTF16BytesInPlace(bytes);
977+
return bytes;
978+
}
979+
980+
private static void swapUTF16BytesInPlace(byte[] bytes) {
981+
for (int i = 0; i < bytes.length; i += 2) {
982+
byte tmp = bytes[i];
983+
bytes[i] = bytes[i + 1];
984+
bytes[i + 1] = tmp;
985+
}
986+
}
987+
988+
private static void writeBytesStringUTF16BE(MemoryBuffer buffer, byte[] bytes) {
989+
int bytesLen = bytes.length;
990+
long header = ((long) bytesLen << 2) | UTF16;
991+
int writerIndex = buffer.writerIndex();
992+
buffer.ensure(writerIndex + 9 + bytesLen);
993+
final byte[] targetArray = buffer.getHeapMemory();
994+
if (targetArray != null) {
995+
final int targetIndex = buffer._unsafeHeapWriterIndex();
996+
int arrIndex = targetIndex;
997+
arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex, header);
998+
writerIndex += arrIndex - targetIndex;
999+
for (int i = 0; i < bytesLen; i += 2) {
1000+
targetArray[arrIndex + i] = bytes[i + 1];
1001+
targetArray[arrIndex + i + 1] = bytes[i];
1002+
}
1003+
} else {
1004+
writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
1005+
byte[] tmpArray = new byte[bytesLen];
1006+
for (int i = 0; i < bytesLen; i += 2) {
1007+
tmpArray[i] = bytes[i + 1];
1008+
tmpArray[i + 1] = bytes[i];
1009+
}
1010+
buffer.put(writerIndex, tmpArray, 0, bytesLen);
1011+
}
1012+
buffer._unsafeWriterIndex(writerIndex + bytesLen);
1013+
}
1014+
9371015
private static byte bestCoder(char[] chars) {
9381016
int numChars = chars.length;
9391017
// sample 64 chars

0 commit comments

Comments
 (0)