Skip to content

Commit 2940ef9

Browse files
committed
Improve ASCII performance
1 parent beaf333 commit 2940ef9

File tree

3 files changed

+243
-60
lines changed

3 files changed

+243
-60
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java

Lines changed: 185 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,19 @@
55
import java.math.BigInteger;
66
import java.nio.charset.Charset;
77
import java.nio.charset.StandardCharsets;
8-
import java.util.*;
8+
import java.util.ArrayList;
9+
import java.util.Arrays;
10+
import java.util.Stack;
911

1012
import com.fasterxml.jackson.core.*;
1113
import com.fasterxml.jackson.core.base.ParserMinimalBase;
1214
import com.fasterxml.jackson.core.io.IOContext;
1315
import com.fasterxml.jackson.core.io.NumberInput;
1416
import com.fasterxml.jackson.core.json.DupDetector;
1517
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
16-
import com.fasterxml.jackson.core.util.*;
18+
import com.fasterxml.jackson.core.util.ByteArrayBuilder;
19+
import com.fasterxml.jackson.core.util.JacksonFeatureSet;
20+
import com.fasterxml.jackson.core.util.TextBuffer;
1721

1822
import static com.fasterxml.jackson.dataformat.cbor.CBORConstants.*;
1923

@@ -328,6 +332,11 @@ public int getFirstTag() {
328332
*/
329333
protected int _typeByte;
330334

335+
/**
336+
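* Write offset into the current text buffer segment, handed over between the
* text-decoding helper methods that share that segment so each one continues
* writing where the previous one stopped.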
337+
*/
338+
protected int _sharedOutBufferPtr;
339+
331340
/**
332341
* Type to keep track of a list of string references. A depth is stored to know when to pop the
333342
* references off the stack for nested namespaces.
@@ -2289,10 +2298,9 @@ protected void _finishToken() throws IOException
22892298

22902299
if ((available >= len)
22912300
// if not, could we read? NOTE: we do not require it, just attempt to read
2292-
|| ((_inputBuffer.length >= len)
2293-
&& _tryToLoadToHaveAtLeast(len))) {
2294-
_finishShortText(len);
2295-
return;
2301+
|| _tryToLoadToHaveAtLeast(len)) {
2302+
_finishShortText(len);
2303+
return;
22962304
}
22972305
// If not enough space, need handling similar to chunked
22982306
_finishLongText(len);
@@ -2331,11 +2339,9 @@ protected String _finishTextToken(int ch) throws IOException
23312339
// due to inputBuffer never being even close to that big).
23322340

23332341
final int available = _inputEnd - _inputPtr;
2334-
23352342
if ((available >= len)
23362343
// if not, could we read? NOTE: we do not require it, just attempt to read
2337-
|| ((_inputBuffer.length >= len)
2338-
&& _tryToLoadToHaveAtLeast(len))) {
2344+
|| _tryToLoadToHaveAtLeast(len)) {
23392345
return _finishShortText(len);
23402346
}
23412347
// If not enough space, need handling similar to chunked
@@ -2364,19 +2370,22 @@ private final String _finishShortText(int len) throws IOException
23642370

23652371
// Let's actually do a tight loop for ASCII first:
23662372
final int end = _inputPtr;
2367-
2368-
int i;
2369-
while ((i = inputBuf[inPtr]) >= 0) {
2373+
int i = 0;
2374+
while (inPtr < end && i >= 0) {
2375+
i = inputBuf[inPtr++];
23702376
outBuf[outPtr++] = (char) i;
2371-
if (++inPtr == end) {
2372-
String str = _textBuffer.setCurrentAndReturn(outPtr);
2373-
if (stringRefs != null) {
2374-
stringRefs.stringRefs.add(str);
2375-
_sharedString = str;
2376-
}
2377-
return str;
2377+
}
2378+
if (inPtr == end && i >= 0) {
2379+
String str = _textBuffer.setCurrentAndReturn(outPtr);
2380+
if (stringRefs != null) {
2381+
stringRefs.stringRefs.add(str);
2382+
_sharedString = str;
23782383
}
2384+
return str;
23792385
}
2386+
// Correct extra increments
2387+
outPtr -= 1;
2388+
inPtr -= 1;
23802389
final int[] codes = UTF8_UNIT_CODES;
23812390
do {
23822391
i = inputBuf[inPtr++] & 0xFF;
@@ -2443,10 +2452,17 @@ private final String _finishShortText(int len) throws IOException
24432452

24442453
private final String _finishLongText(int len) throws IOException
24452454
{
2446-
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
2447-
int outPtr = 0;
2448-
final int[] codes = UTF8_UNIT_CODES;
2455+
StringRefList stringRefs = null;
2456+
if (!_stringRefs.empty() &&
2457+
shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) {
2458+
stringRefs = _stringRefs.peek();
2459+
}
2460+
// First a tight loop for ASCII.
2461+
len = _finishLongTextAscii(len);
2462+
char[] outBuf = _textBuffer.getBufferWithoutReset();
2463+
int outPtr = _sharedOutBufferPtr;
24492464
int outEnd = outBuf.length;
2465+
final int[] codes = UTF8_UNIT_CODES;
24502466

24512467
while (--len >= 0) {
24522468
int c = _nextByte() & 0xFF;
@@ -2500,14 +2516,52 @@ private final String _finishLongText(int len) throws IOException
25002516
outBuf[outPtr++] = (char) c;
25012517
}
25022518
String str = _textBuffer.setCurrentAndReturn(outPtr);
2503-
if (!_stringRefs.empty() &&
2504-
shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) {
2505-
_stringRefs.peek().stringRefs.add(str);
2519+
if (stringRefs != null) {
2520+
stringRefs.stringRefs.add(str);
25062521
_sharedString = str;
25072522
}
25082523
return str;
25092524
}
25102525

2526+
/**
2527+
* Consumes as many ascii chars as possible in a tight loop. Returns the amount of bytes remaining.
2528+
*/
2529+
private final int _finishLongTextAscii(int len) throws IOException
2530+
{
2531+
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
2532+
final byte[] input = _inputBuffer;
2533+
_sharedOutBufferPtr = 0;
2534+
while (len > 0) {
2535+
// load as much input as possible
2536+
int size = Math.min(len, Math.min(outBuf.length, input.length));
2537+
if (!_tryToLoadToHaveAtLeast(size)) {
2538+
_sharedOutBufferPtr = 0;
2539+
return len;
2540+
}
2541+
int outEnd = size;
2542+
int outPtr = 0;
2543+
int inPtr = _inputPtr;
2544+
int i = 0;
2545+
// Tight loop to copy into the output buffer, bail if a non-ascii char is found
2546+
while (outPtr < outEnd && i >= 0) {
2547+
i = input[inPtr++];
2548+
outBuf[outPtr++] = (char) i;
2549+
}
2550+
// Found a non-ascii char, correct pointers and return to the caller.
2551+
if (i < 0) {
2552+
_inputPtr = inPtr - 1;
2553+
_sharedOutBufferPtr = outPtr - 1;
2554+
return len - _sharedOutBufferPtr;
2555+
}
2556+
_inputPtr = inPtr;
2557+
if (outPtr >= outBuf.length) {
2558+
outBuf = _textBuffer.finishCurrentSegment();
2559+
}
2560+
len -= size;
2561+
}
2562+
return len;
2563+
}
2564+
25112565
private final void _finishChunkedText() throws IOException
25122566
{
25132567
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
@@ -2532,7 +2586,6 @@ private final void _finishChunkedText() throws IOException
25322586
}
25332587
break;
25342588
}
2535-
_chunkLeft = len;
25362589
int end = _inputPtr + len;
25372590
if (end <= _inputEnd) { // all within buffer
25382591
_chunkLeft = 0;
@@ -2541,19 +2594,22 @@ private final void _finishChunkedText() throws IOException
25412594
_chunkLeft = (end - _inputEnd);
25422595
_chunkEnd = _inputEnd;
25432596
}
2544-
}
2545-
// besides of which just need to ensure there's content
2546-
if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk
2547-
loadMoreGuaranteed();
2548-
int end = _inputPtr + _chunkLeft;
2549-
if (end <= _inputEnd) { // all within buffer
2550-
_chunkLeft = 0;
2551-
_chunkEnd = end;
2552-
} else { // stretches beyond
2553-
_chunkLeft = (end - _inputEnd);
2554-
_chunkEnd = _inputEnd;
2597+
// start of a new chunk
2598+
// First a tight loop for ASCII.
2599+
_sharedOutBufferPtr = outPtr;
2600+
if (_finishChunkedTextAscii()) {
2601+
// chunk fully consumed, let's get the next one
2602+
outBuf = _textBuffer.getBufferWithoutReset();
2603+
outPtr = _sharedOutBufferPtr;
2604+
outEnd = outBuf.length;
2605+
continue;
25552606
}
2607+
outBuf = _textBuffer.getBufferWithoutReset();
2608+
outEnd = outBuf.length;
2609+
outPtr = _sharedOutBufferPtr;
25562610
}
2611+
// besides of which just need to ensure there's content
2612+
_loadMoreForChunkIfNeeded();
25572613
}
25582614
int c = input[_inputPtr++] & 0xFF;
25592615
int code = codes[c];
@@ -2563,9 +2619,9 @@ private final void _finishChunkedText() throws IOException
25632619
}
25642620

25652621
switch (code) {
2566-
case 0:
2567-
break;
2568-
case 1: // 2-byte UTF
2622+
case 0:
2623+
break;
2624+
case 1: // 2-byte UTF
25692625
{
25702626
int d = _nextChunkedByte();
25712627
if ((d & 0xC0) != 0x080) {
@@ -2574,24 +2630,24 @@ private final void _finishChunkedText() throws IOException
25742630
c = ((c & 0x1F) << 6) | (d & 0x3F);
25752631
}
25762632
break;
2577-
case 2: // 3-byte UTF
2578-
c = _decodeChunkedUTF8_3(c);
2579-
break;
2580-
case 3: // 4-byte UTF
2581-
c = _decodeChunkedUTF8_4(c);
2582-
// Let's add first part right away:
2583-
if (outPtr >= outBuf.length) {
2584-
outBuf = _textBuffer.finishCurrentSegment();
2585-
outPtr = 0;
2586-
outEnd = outBuf.length;
2587-
}
2588-
outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
2589-
c = 0xDC00 | (c & 0x3FF);
2590-
// And let the other char output down below
2591-
break;
2592-
default:
2593-
// Is this good enough error message?
2594-
_reportInvalidInitial(c);
2633+
case 2: // 3-byte UTF
2634+
c = _decodeChunkedUTF8_3(c);
2635+
break;
2636+
case 3: // 4-byte UTF
2637+
c = _decodeChunkedUTF8_4(c);
2638+
// Let's add first part right away:
2639+
if (outPtr >= outBuf.length) {
2640+
outBuf = _textBuffer.finishCurrentSegment();
2641+
outPtr = 0;
2642+
outEnd = outBuf.length;
2643+
}
2644+
outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
2645+
c = 0xDC00 | (c & 0x3FF);
2646+
// And let the other char output down below
2647+
break;
2648+
default:
2649+
// Is this good enough error message?
2650+
_reportInvalidInitial(c);
25952651
}
25962652
// Need more room?
25972653
if (outPtr >= outEnd) {
@@ -2602,9 +2658,76 @@ private final void _finishChunkedText() throws IOException
26022658
// Ok, let's add char to output:
26032659
outBuf[outPtr++] = (char) c;
26042660
}
2661+
26052662
_textBuffer.setCurrentLength(outPtr);
26062663
}
26072664

2665+
/**
2666+
* Reads in a tight loop ASCII text until a non-ASCII char is found. If any, then it returns false to signal the
2667+
* caller that the chunk wasn't finished. The caller will keep adding to the _outBuf at the _outPtr position to
2668+
* finish the current text buffer segment
2669+
*/
2670+
private final boolean _finishChunkedTextAscii() throws IOException
2671+
{
2672+
final byte[] input = _inputBuffer;
2673+
int outPtr = _sharedOutBufferPtr;
2674+
char[] outBuf = _textBuffer.getBufferWithoutReset();
2675+
int outEnd = outBuf.length;
2676+
while (true) {
2677+
// besides of which just need to ensure there's content
2678+
_loadMoreForChunkIfNeeded();
2679+
2680+
// Find the size of the loop
2681+
int inSize = _chunkEnd - _inputPtr;
2682+
int outSize = outEnd - outPtr;
2683+
int inputPtr = _inputPtr;
2684+
int inputPtrEnd = _inputPtr + Math.min(inSize, outSize);
2685+
int i = 0;
2686+
// loop with copying what we can.
2687+
while (inputPtr < inputPtrEnd && i >= 0) {
2688+
i = input[inputPtr++];
2689+
char val = (char) i;
2690+
outBuf[outPtr++] = val;
2691+
}
2692+
_inputPtr = inputPtr;
2693+
2694+
if (i < 0) {
2695+
// Found a non-ascii char, correct pointers and return to the caller.
2696+
outPtr -= 1;
2697+
_inputPtr -= 1;
2698+
_sharedOutBufferPtr = outPtr;
2699+
// return false to signal this to the calling code to allow the multi-byte code-path to kick.
2700+
return false;
2701+
}
2702+
// Need more room?
2703+
if (outPtr >= outEnd) {
2704+
outBuf = _textBuffer.finishCurrentSegment();
2705+
outPtr = 0;
2706+
outEnd = outBuf.length;
2707+
}
2708+
if (_inputPtr < _chunkEnd || _chunkLeft > 0) {
2709+
continue;
2710+
}
2711+
_sharedOutBufferPtr = outPtr;
2712+
return true;
2713+
}
2714+
}
2715+
2716+
private final void _loadMoreForChunkIfNeeded() throws IOException
2717+
{
2718+
if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk
2719+
loadMoreGuaranteed();
2720+
int end = _inputPtr + _chunkLeft;
2721+
if (end <= _inputEnd) { // all within buffer
2722+
_chunkLeft = 0;
2723+
_chunkEnd = end;
2724+
} else { // stretches beyond
2725+
_chunkLeft = (end - _inputEnd);
2726+
_chunkEnd = _inputEnd;
2727+
}
2728+
}
2729+
}
2730+
26082731
private final int _nextByte() throws IOException {
26092732
int inPtr = _inputPtr;
26102733
if (inPtr < _inputEnd) {
@@ -3716,6 +3839,10 @@ protected final boolean _tryToLoadToHaveAtLeast(int minAvailable) throws IOExcep
37163839
if (_inputStream == null) {
37173840
return false;
37183841
}
3842+
// The code below assumes this is true, so we check it here.
3843+
if (_inputBuffer.length < minAvailable) {
3844+
return false;
3845+
}
37193846
// Need to move remaining data in front?
37203847
int amount = _inputEnd - _inputPtr;
37213848
if (amount > 0 && _inputPtr > 0) {

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,10 @@ protected static String generateUnicodeString(int length) {
216216
return generateUnicodeString(length, new Random(length));
217217
}
218218

219+
protected static String generateUnicodeStringWithAsciiPrefix(int asciiPrefixLen, int length) {
220+
return generateUnicodeStringWithAsciiPrefix(asciiPrefixLen, length, new Random(length));
221+
}
222+
219223
protected static String generateUnicodeString(int length, Random rnd)
220224
{
221225
StringBuilder sw = new StringBuilder(length+10);
@@ -241,6 +245,31 @@ protected static String generateUnicodeString(int length, Random rnd)
241245
return sw.toString();
242246
}
243247

248+
protected static String generateUnicodeStringWithAsciiPrefix(int asciiLength, int length, Random rnd)
249+
{
250+
StringBuilder sw = new StringBuilder(length+10);
251+
// add a prefix of ascii chars
252+
int num = asciiLength;
253+
while (--num >= 0) {
254+
sw.append((char) ('A' + (num % 32)));
255+
}
256+
do {
257+
// Then a unicode char of 2, 3 or 4 bytes long
258+
switch (rnd.nextInt() % 3) {
259+
case 0:
260+
sw.append((char) (256 + rnd.nextInt() & 511));
261+
break;
262+
case 1:
263+
sw.append((char) (2048 + rnd.nextInt() & 4095));
264+
break;
265+
default:
266+
sw.append((char) (65536 + rnd.nextInt() & 0x3FFF));
267+
break;
268+
}
269+
} while (sw.length() < length);
270+
return sw.toString();
271+
}
272+
244273
protected static String generateLongAsciiString(int length) {
245274
return generateLongAsciiString(length, new Random(length));
246275
}

0 commit comments

Comments
 (0)