Skip to content

Commit ccc1c63

Browse files
authored
fix(java): fix openj9 sliced string serde (#3160)
## Why? ## What does this PR do? ## Related issues Closes #2079 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark
1 parent a4320bc commit ccc1c63

File tree

3 files changed

+412
-12
lines changed

3 files changed

+412
-12
lines changed
Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.fory.serializer;
21+
22+
import org.apache.fory.memory.LittleEndian;
23+
import org.apache.fory.memory.MemoryBuffer;
24+
import org.apache.fory.memory.Platform;
25+
import org.apache.fory.util.MathUtils;
26+
import org.apache.fory.util.StringEncodingUtils;
27+
import org.apache.fory.util.StringUtils;
28+
29+
final class SlicedStringUtil {
30+
// Encoding tags. The writers in this class OR one of these into the low two bits of the
// string header, i.e. header = (length << 2) | tag.
private static final byte LATIN1 = 0;
31+
private static final byte UTF16 = 1;
32+
private static final byte UTF8 = 2;
33+
34+
// Holder of static helpers only; the private constructor prevents instantiation.
private SlicedStringUtil() {}
35+
36+
static void writeCharsLatin1WithOffset(
37+
StringSerializer serializer, MemoryBuffer buffer, char[] chars, int offset, int count) {
38+
int writerIndex = buffer.writerIndex();
39+
long header = ((long) count << 2) | LATIN1;
40+
buffer.ensure(writerIndex + 5 + count);
41+
byte[] targetArray = buffer.getHeapMemory();
42+
if (targetArray != null) {
43+
final int targetIndex = buffer._unsafeHeapWriterIndex();
44+
int arrIndex = targetIndex;
45+
arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex, header);
46+
writerIndex += arrIndex - targetIndex;
47+
for (int i = 0; i < count; i++) {
48+
targetArray[arrIndex + i] = (byte) chars[offset + i];
49+
}
50+
} else {
51+
writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
52+
final byte[] tmpArray = serializer.getByteArray(count);
53+
for (int i = 0; i < count; i++) {
54+
tmpArray[i] = (byte) chars[offset + i];
55+
}
56+
buffer.put(writerIndex, tmpArray, 0, count);
57+
}
58+
writerIndex += count;
59+
buffer._unsafeWriterIndex(writerIndex);
60+
}
61+
62+
static void writeCharsUTF16WithOffset(
63+
StringSerializer serializer, MemoryBuffer buffer, char[] chars, int offset, int count) {
64+
int numBytes = MathUtils.doubleExact(count);
65+
int writerIndex = buffer.writerIndex();
66+
long header = ((long) numBytes << 2) | UTF16;
67+
buffer.ensure(writerIndex + 5 + numBytes);
68+
final byte[] targetArray = buffer.getHeapMemory();
69+
if (targetArray != null) {
70+
final int targetIndex = buffer._unsafeHeapWriterIndex();
71+
int arrIndex = targetIndex;
72+
arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex, header);
73+
writerIndex += arrIndex - targetIndex + numBytes;
74+
if (Platform.IS_LITTLE_ENDIAN) {
75+
// FIXME JDK11 utf16 string uses little-endian order.
76+
Platform.UNSAFE.copyMemory(
77+
chars,
78+
Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1),
79+
targetArray,
80+
Platform.BYTE_ARRAY_OFFSET + arrIndex,
81+
numBytes);
82+
} else {
83+
writeCharsUTF16BEToHeap(chars, offset, arrIndex, numBytes, targetArray);
84+
}
85+
} else {
86+
writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
87+
if (Platform.IS_LITTLE_ENDIAN) {
88+
writerIndex =
89+
offHeapWriteCharsUTF16WithOffset(
90+
serializer, buffer, chars, offset, writerIndex, numBytes);
91+
} else {
92+
writerIndex =
93+
offHeapWriteCharsUTF16BEWithOffset(
94+
serializer, buffer, chars, offset, writerIndex, numBytes);
95+
}
96+
}
97+
buffer._unsafeWriterIndex(writerIndex);
98+
}
99+
100+
static void writeCharsUTF8WithOffset(
101+
StringSerializer serializer, MemoryBuffer buffer, char[] chars, int offset, int count) {
102+
int estimateMaxBytes = count * 3;
103+
int approxNumBytes = (int) (count * 1.5) + 1;
104+
int writerIndex = buffer.writerIndex();
105+
buffer.ensure(writerIndex + 9 + estimateMaxBytes);
106+
byte[] targetArray = buffer.getHeapMemory();
107+
if (targetArray != null) {
108+
int targetIndex = buffer._unsafeHeapWriterIndex();
109+
int headerPos = targetIndex;
110+
int arrIndex = targetIndex;
111+
long header = ((long) approxNumBytes << 2) | UTF8;
112+
int headerBytesWritten = LittleEndian.putVarUint36Small(targetArray, arrIndex, header);
113+
arrIndex += headerBytesWritten;
114+
writerIndex += headerBytesWritten;
115+
targetIndex =
116+
StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count, targetArray, arrIndex);
117+
byte stashedByte = targetArray[arrIndex];
118+
int written = targetIndex - arrIndex;
119+
header = ((long) written << 2) | UTF8;
120+
int diff =
121+
LittleEndian.putVarUint36Small(targetArray, headerPos, header) - headerBytesWritten;
122+
if (diff != 0) {
123+
handleWriteCharsUTF8UnalignedHeaderBytes(targetArray, arrIndex, diff, written, stashedByte);
124+
}
125+
buffer._unsafeWriterIndex(writerIndex + written + diff);
126+
} else {
127+
final byte[] tmpArray = serializer.getByteArray(estimateMaxBytes);
128+
int written = StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count, tmpArray, 0);
129+
long header = ((long) written << 2) | UTF8;
130+
writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
131+
buffer.put(writerIndex, tmpArray, 0, written);
132+
buffer._unsafeWriterIndex(writerIndex + written);
133+
}
134+
}
135+
136+
static void writeCharsUTF8PerfOptimizedWithOffset(
137+
StringSerializer serializer, MemoryBuffer buffer, char[] chars, int offset, int count) {
138+
int estimateMaxBytes = count * 3;
139+
int numBytes = MathUtils.doubleExact(count);
140+
int writerIndex = buffer.writerIndex();
141+
long header = ((long) numBytes << 2) | UTF8;
142+
buffer.ensure(writerIndex + 9 + estimateMaxBytes);
143+
byte[] targetArray = buffer.getHeapMemory();
144+
if (targetArray != null) {
145+
int targetIndex = buffer._unsafeHeapWriterIndex();
146+
int arrIndex = targetIndex;
147+
arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex, header);
148+
writerIndex += arrIndex - targetIndex;
149+
targetIndex =
150+
StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count, targetArray, arrIndex + 4);
151+
int written = targetIndex - arrIndex - 4;
152+
buffer._unsafePutInt32(writerIndex, written);
153+
buffer._unsafeWriterIndex(writerIndex + 4 + written);
154+
} else {
155+
final byte[] tmpArray = serializer.getByteArray(estimateMaxBytes);
156+
int written = StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count, tmpArray, 0);
157+
writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
158+
buffer._unsafePutInt32(writerIndex, written);
159+
writerIndex += 4;
160+
buffer.put(writerIndex, tmpArray, 0, written);
161+
buffer._unsafeWriterIndex(writerIndex + written);
162+
}
163+
}
164+
165+
static boolean isLatin(char[] chars, int offset, int count) {
166+
int end = offset + count;
167+
int vectorizedChars = count & ~3;
168+
int vectorEnd = offset + vectorizedChars;
169+
long byteOffset = Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1);
170+
long endOffset = Platform.CHAR_ARRAY_OFFSET + ((long) vectorEnd << 1);
171+
for (long off = byteOffset; off < endOffset; off += 8) {
172+
long multiChars = Platform.getLong(chars, off);
173+
if ((multiChars & StringUtils.MULTI_CHARS_NON_LATIN_MASK) != 0) {
174+
return false;
175+
}
176+
}
177+
for (int i = vectorEnd; i < end; i++) {
178+
if (chars[i] > 0xFF) {
179+
return false;
180+
}
181+
}
182+
return true;
183+
}
184+
185+
static byte bestCoder(char[] chars, int offset, int count) {
186+
int sampleNum = Math.min(64, count);
187+
int vectorizedLen = sampleNum >> 2;
188+
int vectorizedChars = vectorizedLen << 2;
189+
long byteOffset = Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1);
190+
long endOffset = byteOffset + ((long) vectorizedChars << 1);
191+
int asciiCount = 0;
192+
int latin1Count = 0;
193+
int charOffset = offset;
194+
for (long off = byteOffset; off < endOffset; off += 8, charOffset += 4) {
195+
long multiChars = Platform.getLong(chars, off);
196+
if ((multiChars & StringUtils.MULTI_CHARS_NON_ASCII_MASK) == 0) {
197+
latin1Count += 4;
198+
asciiCount += 4;
199+
} else if ((multiChars & StringUtils.MULTI_CHARS_NON_LATIN_MASK) == 0) {
200+
latin1Count += 4;
201+
for (int i = 0; i < 4; ++i) {
202+
if (chars[charOffset + i] < 0x80) {
203+
asciiCount++;
204+
}
205+
}
206+
} else {
207+
for (int i = 0; i < 4; ++i) {
208+
char c = chars[charOffset + i];
209+
if (c < 0x80) {
210+
latin1Count++;
211+
asciiCount++;
212+
} else if (c <= 0xFF) {
213+
latin1Count++;
214+
}
215+
}
216+
}
217+
}
218+
219+
for (int i = vectorizedChars; i < sampleNum; i++) {
220+
char c = chars[offset + i];
221+
if (c < 0x80) {
222+
latin1Count++;
223+
asciiCount++;
224+
} else if (c <= 0xFF) {
225+
latin1Count++;
226+
}
227+
}
228+
229+
if (latin1Count == count || (latin1Count == sampleNum && isLatin(chars, offset, count))) {
230+
return LATIN1;
231+
} else if (asciiCount >= sampleNum * 0.5) {
232+
return UTF8;
233+
} else {
234+
return UTF16;
235+
}
236+
}
237+
238+
private static void handleWriteCharsUTF8UnalignedHeaderBytes(
239+
byte[] targetArray, int arrIndex, int diff, int written, byte stashed) {
240+
if (diff == 1) {
241+
System.arraycopy(targetArray, arrIndex + 1, targetArray, arrIndex + 2, written - 1);
242+
targetArray[arrIndex + 1] = stashed;
243+
} else {
244+
System.arraycopy(targetArray, arrIndex, targetArray, arrIndex - 1, written);
245+
}
246+
}
247+
248+
private static void writeCharsUTF16BEToHeap(
249+
char[] chars, int offset, int arrIndex, int numBytes, byte[] targetArray) {
250+
int charIndex = offset;
251+
for (int i = arrIndex, end = i + numBytes; i < end; i += 2) {
252+
char c = chars[charIndex++];
253+
targetArray[i] = (byte) c;
254+
targetArray[i + 1] = (byte) (c >>> 8);
255+
}
256+
}
257+
258+
private static int offHeapWriteCharsUTF16WithOffset(
259+
StringSerializer serializer,
260+
MemoryBuffer buffer,
261+
char[] chars,
262+
int offset,
263+
int writerIndex,
264+
int numBytes) {
265+
byte[] tmpArray = serializer.getByteArray(numBytes);
266+
Platform.UNSAFE.copyMemory(
267+
chars,
268+
Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1),
269+
tmpArray,
270+
Platform.BYTE_ARRAY_OFFSET,
271+
numBytes);
272+
buffer.put(writerIndex, tmpArray, 0, numBytes);
273+
writerIndex += numBytes;
274+
return writerIndex;
275+
}
276+
277+
private static int offHeapWriteCharsUTF16BEWithOffset(
278+
StringSerializer serializer,
279+
MemoryBuffer buffer,
280+
char[] chars,
281+
int offset,
282+
int writerIndex,
283+
int numBytes) {
284+
byte[] tmpArray = serializer.getByteArray(numBytes);
285+
int charIndex = offset;
286+
for (int i = 0; i < numBytes; i += 2) {
287+
char c = chars[charIndex++];
288+
tmpArray[i] = (byte) c;
289+
tmpArray[i + 1] = (byte) (c >>> 8);
290+
}
291+
buffer.put(writerIndex, tmpArray, 0, numBytes);
292+
writerIndex += numBytes;
293+
return writerIndex;
294+
}
295+
}

0 commit comments

Comments
 (0)