Skip to content

Commit 3ada50f

Browse files
optimiz3rstam
authored and committed
Improve deserialization performance by an additional ~8% and reduce memory churn
ReadCString is one of the top CPU consumers during deserialization because two O(N) passes to calculate the string length plus an O(N) memory copy are performed. Performance can be significantly improved by special-casing empty strings, single-character strings, and the "_id" string during deserialization. Doing so eliminates transient string memory allocations and is done using a single O(1) lookup. This significantly improves Dictionaries encoded as ArrayOfDocuments ({ k, v }) and helps general-purpose users that are already following the MongoDB performance best practice of using single-character field names. The following scenario is special-cased in ReadCString: 1. The "_id" string. The following scenarios are special-cased in both ReadCString and ReadString: 1. Empty strings. 2. Single-character strings (by definition ASCII when dealing with UTF8). Other notes: 1. Fixed an off-by-one bug in ReadString in a call to EnsureDataAvailable. 2. Switched to a UTF8Encoding that will throw on invalid characters rather than corrupting the returned data.
1 parent 318d5ad commit 3ada50f

File tree

1 file changed

+112
-16
lines changed

1 file changed

+112
-16
lines changed

Bson/IO/BsonBuffer.cs

Lines changed: 112 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ public class BsonBuffer : IDisposable
3131
private static Stack<byte[]> __chunkPool = new Stack<byte[]>();
3232
private static int __maxChunkPoolSize = 64;
3333
private const int __chunkSize = 16 * 1024; // 16KiB
34+
private static readonly string[] __asciiStringTable = BuildAsciiStringTable();
35+
private static readonly UTF8Encoding __utf8Encoding = new UTF8Encoding(false, true); // throw on invalid bytes
3436
private static readonly bool[] __validBsonTypes = new bool[256];
3537

3638
// private fields
@@ -506,17 +508,17 @@ public string ReadString()
506508
{
507509
if (_disposed) { throw new ObjectDisposedException("BsonBuffer"); }
508510
var length = ReadInt32();
509-
EnsureDataAvailable(length + 1);
511+
EnsureDataAvailable(length);
510512
string value;
511513
if (__chunkSize - _chunkOffset >= length - 1)
512514
{
513-
value = Encoding.UTF8.GetString(_chunk, _chunkOffset, length - 1);
515+
value = ParseString(_chunk, _chunkOffset, length - 1);
514516
Position += length - 1;
515517
}
516518
else
517519
{
518520
// straddles chunk boundary
519-
value = Encoding.UTF8.GetString(ReadBytes(length - 1));
521+
value = __utf8Encoding.GetString(ReadBytes(length - 1));
520522
}
521523
byte terminator = ReadByte();
522524
if (terminator != 0)
@@ -543,11 +545,11 @@ public string ReadCString()
543545
{
544546
partialCount = _length - _position; // populated part of last chunk
545547
}
546-
var index = Array.IndexOf<byte>(_chunk, 0, _chunkOffset, partialCount);
547-
if (index != -1)
548+
549+
string value;
550+
var stringLength = TryParseCString(_chunk, _chunkOffset, partialCount, out value);
551+
if (stringLength >= 0)
548552
{
549-
var stringLength = index - _chunkOffset;
550-
var value = Encoding.UTF8.GetString(_chunk, _chunkOffset, stringLength);
551553
Position += stringLength + 1;
552554
return value;
553555
}
@@ -566,12 +568,12 @@ public string ReadCString()
566568
{
567569
partialCount = _length - localPosition; // populated part of last chunk
568570
}
569-
index = Array.IndexOf<byte>(localChunk, 0, 0, partialCount);
571+
var index = Array.IndexOf<byte>(localChunk, 0, 0, partialCount);
570572
if (index != -1)
571573
{
572574
localPosition += index;
573-
var stringLength = localPosition - _position;
574-
var value = Encoding.UTF8.GetString(ReadBytes(stringLength)); // ReadBytes advances over string
575+
stringLength = localPosition - _position;
576+
value = __utf8Encoding.GetString(ReadBytes(stringLength)); // ReadBytes advances over string
575577
Position += 1; // skip over null byte at end
576578
return value;
577579
}
@@ -711,18 +713,18 @@ public void WriteBytes(byte[] value)
711713
public void WriteCString(string value)
712714
{
713715
if (_disposed) { throw new ObjectDisposedException("BsonBuffer"); }
714-
int maxLength = Encoding.UTF8.GetMaxByteCount(value.Length) + 1;
716+
int maxLength = __utf8Encoding.GetMaxByteCount(value.Length) + 1;
715717
EnsureSpaceAvailable(maxLength);
716718
if (__chunkSize - _chunkOffset >= maxLength)
717719
{
718-
int length = Encoding.UTF8.GetBytes(value, 0, value.Length, _chunk, _chunkOffset);
720+
int length = __utf8Encoding.GetBytes(value, 0, value.Length, _chunk, _chunkOffset);
719721
_chunk[_chunkOffset + length] = 0;
720722
Position += length + 1;
721723
}
722724
else
723725
{
724726
// straddles chunk boundary
725-
byte[] bytes = Encoding.UTF8.GetBytes(value);
727+
byte[] bytes = __utf8Encoding.GetBytes(value);
726728
WriteBytes(bytes);
727729
WriteByte(0);
728730
}
@@ -829,11 +831,11 @@ public void WriteObjectId(int timestamp, int machine, short pid, int increment)
829831
public void WriteString(string value)
830832
{
831833
if (_disposed) { throw new ObjectDisposedException("BsonBuffer"); }
832-
int maxLength = Encoding.UTF8.GetMaxByteCount(value.Length) + 5;
834+
int maxLength = __utf8Encoding.GetMaxByteCount(value.Length) + 5;
833835
EnsureSpaceAvailable(maxLength);
834836
if (__chunkSize - _chunkOffset >= maxLength)
835837
{
836-
int length = Encoding.UTF8.GetBytes(value, 0, value.Length, _chunk, _chunkOffset + 4); // write string first
838+
int length = __utf8Encoding.GetBytes(value, 0, value.Length, _chunk, _chunkOffset + 4); // write string first
837839
int lengthPlusOne = length + 1;
838840
_chunk[_chunkOffset + 0] = (byte)(lengthPlusOne); // now we know the length
839841
_chunk[_chunkOffset + 1] = (byte)(lengthPlusOne >> 8);
@@ -845,7 +847,7 @@ public void WriteString(string value)
845847
else
846848
{
847849
// straddles chunk boundary
848-
byte[] bytes = Encoding.UTF8.GetBytes(value);
850+
byte[] bytes = __utf8Encoding.GetBytes(value);
849851
WriteInt32(bytes.Length + 1);
850852
WriteBytes(bytes);
851853
WriteByte(0);
@@ -897,6 +899,100 @@ public void WriteZero()
897899
}
898900
}
899901

902+
// private static methods
903+
/// <summary>
/// Creates the lookup table of one-character strings, one entry per code point below 128.
/// </summary>
/// <returns>
/// A 128-element array in which the element at index <c>i</c> is the string made of the single character <c>(char)i</c>.
/// </returns>
private static string[] BuildAsciiStringTable()
{
    var table = new string[128];

    // walk the code points directly; a char indexes the array through its implicit int conversion
    for (char ch = (char)0; ch < table.Length; ch++)
    {
        table[ch] = new string(ch, 1);
    }

    return table;
}
914+
915+
/// <summary>
/// Decodes <paramref name="stringLength"/> bytes of <paramref name="buffer"/>, starting at
/// <paramref name="startIndex"/>, into a string.
/// </summary>
/// <remarks>
/// A zero-length request returns <see cref="string.Empty"/> and a one-byte request returns the
/// shared entry from <c>__asciiStringTable</c>, so neither allocates a new string. Every other
/// length is handed to <c>__utf8Encoding</c>.
/// </remarks>
/// <param name="buffer">The bytes to decode.</param>
/// <param name="startIndex">Index of the first byte of the string within <paramref name="buffer"/>.</param>
/// <param name="stringLength">Number of bytes that make up the string.</param>
/// <returns>The decoded string.</returns>
/// <exception cref="DecoderFallbackException">
/// The string is exactly one byte long and that byte is 128 or greater.
/// </exception>
private static string ParseString(byte[] buffer, int startIndex, int stringLength)
{
    // empty string: nothing to decode
    if (stringLength == 0)
    {
        return string.Empty;
    }

    // single byte: either a table hit or a lone byte from a multi-byte sequence
    if (stringLength == 1)
    {
        var single = buffer[startIndex];
        if (single < 128)
        {
            return __asciiStringTable[single];
        }

        // a byte of 128 or above cannot stand alone; multiple bytes would be required
        throw new DecoderFallbackException("[" + single.ToString("X2") + "] is an invalid character");
    }

    // general case
    return __utf8Encoding.GetString(buffer, startIndex, stringLength);
}
936+
937+
/// <summary>
/// Attempts to read a null-terminated string from <paramref name="buffer"/>, looking at no more
/// than <paramref name="length"/> bytes starting at <paramref name="startIndex"/>.
/// </summary>
/// <remarks>
/// The empty string, single-character strings and the "_id" string are recognized by direct byte
/// comparison and returned without decoding; anything else is located with a search for the null
/// terminator and decoded with <c>__utf8Encoding</c>.
/// </remarks>
/// <param name="buffer">The bytes to scan.</param>
/// <param name="startIndex">Index of the first byte of the string within <paramref name="buffer"/>.</param>
/// <param name="length">Number of bytes, counted from <paramref name="startIndex"/>, that may be examined.</param>
/// <param name="value">The parsed string, or null when no null terminator was found in the examined range.</param>
/// <returns>The number of bytes parsed excluding the null terminator; -1 otherwise.</returns>
/// <exception cref="DecoderFallbackException">
/// The string consists of a single byte that is 128 or greater.
/// </exception>
private static int TryParseCString(byte[] buffer, int startIndex, int length, out string value)
{
    // default for every "not found" exit below
    value = null;

    if (length < 1)
    {
        return -1;
    }

    // empty string: the very first byte is the terminator
    var first = buffer[startIndex];
    if (first == 0)
    {
        value = string.Empty;
        return 0;
    }

    if (length < 2)
    {
        return -1;
    }

    // single character string: the second byte is the terminator
    var second = buffer[startIndex + 1];
    if (second == 0)
    {
        if (first >= 128)
        {
            // multiple bytes required
            throw new DecoderFallbackException("[" + first.ToString("X2") + "] is an invalid character");
        }

        value = __asciiStringTable[first];
        return 1;
    }

    // the "_id" string: '_' 'i' 'd' '\0'
    var isIdName =
        length >= 4 &&
        first == 0x5f &&                   // '_'
        second == 0x69 &&                  // 'i'
        buffer[startIndex + 2] == 0x64 &&  // 'd'
        buffer[startIndex + 3] == 0;       // '\0'
    if (isIdName)
    {
        value = "_id";
        return 3;
    }

    // general case: the first two bytes are already known to be non-zero, so search after them
    var terminatorIndex = Array.IndexOf<byte>(buffer, 0, startIndex + 2, length - 2);
    if (terminatorIndex == -1)
    {
        return -1;
    }

    var byteCount = terminatorIndex - startIndex;
    value = __utf8Encoding.GetString(buffer, startIndex, byteCount);
    return byteCount;
}
995+
900996
// private methods
901997
private void EnsureDataAvailable(int needed)
902998
{

0 commit comments

Comments
 (0)