Skip to content

Commit 6fd68cd

Browse files
committed
Add VarHandle-based StringEncoder
1 parent 1e763b2 commit 6fd68cd

File tree

18 files changed

+977
-495
lines changed

18 files changed

+977
-495
lines changed

exporters/common/build.gradle.kts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,46 @@ plugins {
88
description = "OpenTelemetry Exporter Common"
99
otelJava.moduleName.set("io.opentelemetry.exporter.internal")
1010

11+
// Declares an additional "java9" source set rooted at src/main/java9.
java {
  sourceSets {
    create("java9") {
      java {
        srcDir("src/main/java9")
      }
      // Make java9 source set depend on main source set
      // since VarHandleStringEncoder implements StringEncoder from the main source set.
      // The main output and the main compile classpath are added exactly once here; no
      // further compileClasspath wiring is needed for this source set.
      compileClasspath += sourceSets.main.get().output + sourceSets.main.get().compileClasspath
    }
  }
}
29+
// Compile the java9 source set against the Java 9 API level.
tasks.named<JavaCompile>("compileJava9Java").configure {
  options.release.set(9)
}
32+
33+
// Package the java9 classes as a multi-release jar: flag the manifest and copy the java9
// output under META-INF/versions/9.
tasks.named<Jar>("jar") {
  manifest {
    attributes(mapOf("Multi-Release" to "true"))
  }
  val java9Output = sourceSets.named("java9").get().output
  from(java9Output) {
    into("META-INF/versions/9")
  }
}
41+
42+
// When the build itself runs on Java 9 or newer, put the java9 classes on the test runtime
// classpath so that StringEncoderHolder.createUnsafeEncoder() can instantiate the Java 9 version.
val javaVersion = JavaVersion.current()
if (javaVersion.isJava9Compatible) {
  val java9Output = sourceSets.named("java9").get().output
  sourceSets.named("test") {
    runtimeClasspath += java9Output
  }
}
50+
1151
val versions: Map<String, String> by project
1252
dependencies {
1353
api(project(":api:all"))
@@ -79,6 +119,14 @@ tasks {
79119
check {
80120
dependsOn(testing.suites)
81121
}
122+
123+
withType<Test> {
  // Allow VarHandle access to String internals
  // generally users won't do this and so won't get the VarHandle implementation
  // but the Java agent is able to automatically open these modules
  // (see ModuleOpener.java in that repository)
  jvmArgs("--add-opens=java.base/java.lang=ALL-UNNAMED")
  // --add-opens does not exist before Java 9; without this flag a Java 8 test JVM would refuse
  // to start with "Unrecognized option".
  jvmArgs("-XX:+IgnoreUnrecognizedVMOptions")
}
82130
}
83131

84132
afterEvaluate {
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry.exporter.internal.marshal;
7+
8+
import java.io.IOException;
9+
import javax.annotation.Nullable;
10+
11+
/**
12+
* This class contains shared logic for UTF-8 encoding operations while allowing subclasses to
13+
* implement different mechanisms for accessing String internal byte arrays (e.g., Unsafe vs
14+
* VarHandle).
15+
*
16+
* <p>This class is internal and is hence not for public use. Its APIs are unstable and can change
17+
* at any time.
18+
*/
19+
abstract class AbstractStringEncoder implements StringEncoder {
20+
21+
// Inner loop can process at most 8 * 255 bytes without overflowing counter. To process more bytes
22+
// inner loop has to be run multiple times.
23+
static final int MAX_INNER_LOOP_SIZE = 8 * 255;
24+
25+
// mask that selects only the most significant bit in every byte of the long
26+
static final long MOST_SIGNIFICANT_BIT_MASK = 0x8080808080808080L;
27+
28+
@Override
29+
public final int getUtf8Size(String string) {
30+
if (string.isEmpty()) {
31+
return 0;
32+
}
33+
34+
byte[] bytes = getStringBytes(string);
35+
if (bytes != null) {
36+
if (isLatin1(string)) {
37+
return string.length() + countNegative(bytes);
38+
} else {
39+
// UTF-16 case - fall back to standard calculation
40+
return string.getBytes(java.nio.charset.StandardCharsets.UTF_8).length;
41+
}
42+
}
43+
44+
// Fall back to standard calculation if we can't access internal bytes
45+
return string.getBytes(java.nio.charset.StandardCharsets.UTF_8).length;
46+
}
47+
48+
@Override
49+
public final void writeUtf8(CodedOutputStream output, String string, int utf8Length)
50+
throws IOException {
51+
if (string.isEmpty()) {
52+
return;
53+
}
54+
55+
byte[] bytes = getStringBytes(string);
56+
if (bytes != null && isLatin1(string)) {
57+
// Fast path for Latin-1 strings
58+
writeUtf8Latin1(output, bytes, string.length());
59+
} else {
60+
// Fall back to standard UTF-8 encoding
61+
byte[] utf8Bytes = string.getBytes(java.nio.charset.StandardCharsets.UTF_8);
62+
output.write(utf8Bytes, 0, utf8Bytes.length);
63+
}
64+
}
65+
66+
@Nullable
67+
protected abstract byte[] getStringBytes(String string);
68+
69+
protected abstract boolean isLatin1(String string);
70+
71+
protected abstract long getLong(byte[] bytes, int offset);
72+
73+
private void writeUtf8Latin1(CodedOutputStream output, byte[] bytes, int length)
74+
throws IOException {
75+
int offset = 0;
76+
77+
// Process 8 bytes at a time for performance
78+
for (int i = 1; i <= length / MAX_INNER_LOOP_SIZE + 1; i++) {
79+
int limit = Math.min((int) (i * MAX_INNER_LOOP_SIZE), length & ~7);
80+
for (; offset < limit; offset += 8) {
81+
long value = getLong(bytes, offset);
82+
long tmp = value & MOST_SIGNIFICANT_BIT_MASK;
83+
if (tmp != 0) {
84+
// Handle bytes with high bit set
85+
for (int j = 0; j < 8; j++) {
86+
int b = (int) ((value >>> (j * 8)) & 0xFF);
87+
if (b < 0) {
88+
// Convert negative byte to UTF-8 sequence
89+
output.write((byte) (0xC0 | ((b & 0xFF) >>> 6)));
90+
output.write((byte) (0x80 | (b & 0x3F)));
91+
} else {
92+
output.write((byte) b);
93+
}
94+
}
95+
} else {
96+
// All bytes are ASCII, write directly
97+
for (int j = 0; j < 8; j++) {
98+
output.write((byte) ((value >>> (j * 8)) & 0xFF));
99+
}
100+
}
101+
}
102+
}
103+
104+
// Process remaining bytes
105+
for (int i = offset; i < length; i++) {
106+
int b = bytes[i] & 0xFF;
107+
if (b >= 0x80) {
108+
// Convert to UTF-8 sequence
109+
output.write((byte) (0xC0 | (b >>> 6)));
110+
output.write((byte) (0x80 | (b & 0x3F)));
111+
} else {
112+
output.write((byte) b);
113+
}
114+
}
115+
}
116+
117+
/** Returns the count of bytes with negative value. */
118+
private static int countNegative(byte[] bytes) {
119+
int count = 0;
120+
int offset = 0;
121+
// We are processing one long (8 bytes) at a time. In the inner loop we are keeping counts in a
122+
// long where each byte in the long is a separate counter. Due to this the inner loop can
123+
// process a maximum of 8*255 bytes at a time without overflow.
124+
for (int i = 1; i <= bytes.length / MAX_INNER_LOOP_SIZE + 1; i++) {
125+
long tmp = 0; // each byte in this long is a separate counter
126+
int limit = Math.min(i * MAX_INNER_LOOP_SIZE, bytes.length & ~7);
127+
for (; offset < limit; offset += 8) {
128+
long value = UnsafeString.getLong(bytes, offset);
129+
// Mask the value keeping only the most significant bit in each byte and then shift this bit
130+
// to the position of the least significant bit in each byte. If the input byte was not
131+
// negative then after this transformation it will be zero, if it was negative then it will
132+
// be one.
133+
tmp += (value & MOST_SIGNIFICANT_BIT_MASK) >>> 7;
134+
}
135+
// sum up counts
136+
if (tmp != 0) {
137+
for (int j = 0; j < 8; j++) {
138+
count += (int) (tmp & 0xff);
139+
tmp = tmp >>> 8;
140+
}
141+
}
142+
}
143+
144+
// Handle remaining bytes. Previous loop processes 8 bytes a time, if the input size is not
145+
// divisible with 8 the remaining bytes are handled here.
146+
for (int i = offset; i < bytes.length; i++) {
147+
// same as if (bytes[i] < 0) count++;
148+
count += bytes[i] >>> 31;
149+
}
150+
return count;
151+
}
152+
}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry.exporter.internal.marshal;
7+
8+
import java.io.IOException;
9+
10+
/**
11+
* Fallback StringEncoder implementation using standard Java string operations.
12+
*
13+
* <p>This implementation works on all Java versions and provides correct UTF-8 handling.
14+
*
15+
* <p>This class is internal and is hence not for public use. Its APIs are unstable and can change
16+
* at any time.
17+
*/
18+
final class FallbackStringEncoder implements StringEncoder {
19+
20+
FallbackStringEncoder() {}
21+
22+
@Override
23+
public int getUtf8Size(String string) {
24+
return encodedUtf8Length(string);
25+
}
26+
27+
@Override
28+
public void writeUtf8(CodedOutputStream output, String string, int utf8Length)
29+
throws IOException {
30+
encodeUtf8(output, string);
31+
}
32+
33+
// adapted from
34+
// https://github.com/protocolbuffers/protobuf/blob/b618f6750aed641a23d5f26fbbaf654668846d24/java/core/src/main/java/com/google/protobuf/Utf8.java#L217
35+
private static int encodedUtf8Length(String string) {
36+
// Warning to maintainers: this implementation is highly optimized.
37+
int utf16Length = string.length();
38+
int utf8Length = utf16Length;
39+
int i = 0;
40+
41+
// This loop optimizes for pure ASCII.
42+
while (i < utf16Length && string.charAt(i) < 0x80) {
43+
i++;
44+
}
45+
46+
// This loop optimizes for chars less than 0x800.
47+
for (; i < utf16Length; i++) {
48+
char c = string.charAt(i);
49+
if (c < 0x800) {
50+
utf8Length += ((0x7f - c) >>> 31); // branch free!
51+
} else {
52+
utf8Length += encodedUtf8LengthGeneral(string, i);
53+
break;
54+
}
55+
}
56+
57+
if (utf8Length < utf16Length) {
58+
// Necessary and sufficient condition for overflow because of maximum 3x expansion
59+
throw new IllegalArgumentException(
60+
"UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
61+
}
62+
63+
return utf8Length;
64+
}
65+
66+
// adapted from
67+
// https://github.com/protocolbuffers/protobuf/blob/b618f6750aed641a23d5f26fbbaf654668846d24/java/core/src/main/java/com/google/protobuf/Utf8.java#L247
68+
private static int encodedUtf8LengthGeneral(String string, int start) {
69+
int utf16Length = string.length();
70+
int utf8Length = 0;
71+
for (int i = start; i < utf16Length; i++) {
72+
char c = string.charAt(i);
73+
if (c < 0x800) {
74+
utf8Length += (0x7f - c) >>> 31; // branch free!
75+
} else {
76+
utf8Length += 2;
77+
if (Character.isSurrogate(c)) {
78+
// Check that we have a well-formed surrogate pair.
79+
if (Character.codePointAt(string, i) != c) {
80+
i++;
81+
} else {
82+
// invalid sequence
83+
// At this point we have accumulated 3 byes of length (2 in this method and 1 in caller)
84+
// for current character, reduce the length to 1 bytes as we are going to encode the
85+
// invalid character as ?
86+
utf8Length -= 2;
87+
}
88+
}
89+
}
90+
}
91+
92+
return utf8Length;
93+
}
94+
95+
// encode utf8 the same way as length is computed in encodedUtf8Length
96+
// adapted from
97+
// https://github.com/protocolbuffers/protobuf/blob/b618f6750aed641a23d5f26fbbaf654668846d24/java/core/src/main/java/com/google/protobuf/Utf8.java#L1016
98+
private static void encodeUtf8(CodedOutputStream output, String in) throws IOException {
99+
int utf16Length = in.length();
100+
int i = 0;
101+
// Designed to take advantage of
102+
// https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
103+
for (char c; i < utf16Length && (c = in.charAt(i)) < 0x80; i++) {
104+
output.write((byte) c);
105+
}
106+
if (i == utf16Length) {
107+
return;
108+
}
109+
110+
for (char c; i < utf16Length; i++) {
111+
c = in.charAt(i);
112+
if (c < 0x80) {
113+
// 1 byte, 7 bits
114+
output.write((byte) c);
115+
} else if (c < 0x800) { // 11 bits, two UTF-8 bytes
116+
output.write((byte) ((0xF << 6) | (c >>> 6)));
117+
output.write((byte) (0x80 | (0x3F & c)));
118+
} else if (!Character.isSurrogate(c)) {
119+
// Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
120+
output.write((byte) ((0xF << 5) | (c >>> 12)));
121+
output.write((byte) (0x80 | (0x3F & (c >>> 6))));
122+
output.write((byte) (0x80 | (0x3F & c)));
123+
} else {
124+
// Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
125+
// four UTF-8 bytes
126+
int codePoint = Character.codePointAt(in, i);
127+
if (codePoint != c) {
128+
output.write((byte) ((0xF << 4) | (codePoint >>> 18)));
129+
output.write((byte) (0x80 | (0x3F & (codePoint >>> 12))));
130+
output.write((byte) (0x80 | (0x3F & (codePoint >>> 6))));
131+
output.write((byte) (0x80 | (0x3F & codePoint)));
132+
i++;
133+
} else {
134+
// invalid sequence
135+
output.write((byte) '?');
136+
}
137+
}
138+
}
139+
}
140+
}

exporters/common/src/main/java/io/opentelemetry/exporter/internal/marshal/MarshalerContext.java

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
*/
2828
public final class MarshalerContext {
2929
private final boolean marshalStringNoAllocation;
30-
private final boolean marshalStringUnsafe;
30+
private final StringEncoder stringEncoder;
3131

3232
private int[] sizes = new int[16];
3333
private int sizeReadIndex;
@@ -37,20 +37,25 @@ public final class MarshalerContext {
3737
private int dataWriteIndex;
3838

3939
public MarshalerContext() {
40-
this(/* marshalStringNoAllocation= */ true, /* marshalStringUnsafe= */ true);
40+
this(/* marshalStringNoAllocation= */ true);
4141
}
4242

43-
public MarshalerContext(boolean marshalStringNoAllocation, boolean marshalStringUnsafe) {
43+
public MarshalerContext(boolean marshalStringNoAllocation) {
4444
this.marshalStringNoAllocation = marshalStringNoAllocation;
45-
this.marshalStringUnsafe = marshalStringUnsafe;
45+
this.stringEncoder = StringEncoder.getInstance();
46+
}
47+
48+
public MarshalerContext(boolean marshalStringNoAllocation, StringEncoder stringEncoder) {
49+
this.marshalStringNoAllocation = marshalStringNoAllocation;
50+
this.stringEncoder = stringEncoder;
4651
}
4752

4853
public boolean marshalStringNoAllocation() {
4954
return marshalStringNoAllocation;
5055
}
5156

52-
public boolean marshalStringUnsafe() {
53-
return marshalStringUnsafe;
57+
public StringEncoder getStringEncoder() {
58+
return stringEncoder;
5459
}
5560

5661
public void addSize(int size) {

0 commit comments

Comments
 (0)