Skip to content

Commit b159d05

Browse files
Rollforward TextFormatEscaper optimization with bug fixed.
PiperOrigin-RevId: 864892608
1 parent 135bec4 commit b159d05

File tree

3 files changed

+48
-8
lines changed

3 files changed

+48
-8
lines changed

java/core/src/main/java/com/google/protobuf/TextFormat.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2938,7 +2938,7 @@ public static class InvalidEscapeSequenceException extends IOException {
29382938
* it's weird.
29392939
*/
29402940
static String escapeText(final String input) {
2941-
return escapeBytes(ByteString.copyFromUtf8(input));
2941+
return TextFormatEscaper.escapeText(input);
29422942
}
29432943

29442944
/** Escape double quotes and backslashes in a String for emittingUnicode output of a message. */

java/core/src/main/java/com/google/protobuf/TextFormatEscaper.java

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,19 +88,53 @@ static String escapeBytes(final ByteString input) {
8888
return escapeBytes(input.toByteArray());
8989
}
9090

91-
static boolean needsEscape(char c) {
92-
return c < 0x20 || c > 0x7e || c == '\'' || c == '"' || c == '\\';
93-
}
94-
9591
/** Like {@link #escapeBytes(ByteString)}, but escapes a text string. */
9692
static String escapeText(String input) {
97-
// Loop on the string to see if any character even needs escaping. If yes, then convert into
98-
// UTF-8 and then escape on those bytes. If not, we can just return the original input.
93+
boolean hasSingleQuote = false;
94+
boolean hasDoubleQuote = false;
95+
boolean hasBackslash = false;
96+
9997
for (int i = 0; i < input.length(); ++i) {
100-
if (needsEscape(input.charAt(i))) {
98+
char c = input.charAt(i);
99+
100+
// If there are any characters outside of ASCII range we eagerly convert to UTF and escape on
101+
// those bytes (including quotes as well). Note that escaping to UTF8 bytes instead of \\u
102+
// sequences is itself somewhat nonsensical, but JavaProto has behaved this way for a long
103+
// time, and changing the behavior would be disruptive.
104+
if (c < 0x20 || c > 0x7e) {
101105
return escapeBytes(input.getBytes(Internal.UTF_8));
102106
}
107+
108+
// While in this loop, keep track if there are any single quotes, double quotes, or
109+
// backslashes. This can help avoid multiple passes over the string looking for each of the
110+
// bad characters.
111+
switch (c) {
112+
case '\'':
113+
hasSingleQuote = true;
114+
break;
115+
case '"':
116+
hasDoubleQuote = true;
117+
break;
118+
case '\\':
119+
hasBackslash = true;
120+
break;
121+
default:
122+
break;
123+
}
103124
}
125+
126+
// Note: escape backslashes first. Order matters to avoid double-escaping the backslashes that
127+
// are added when escaping the quotes.
128+
if (hasBackslash) {
129+
input = input.replace("\\", "\\\\");
130+
}
131+
if (hasSingleQuote) {
132+
input = input.replace("\'", "\\\'");
133+
}
134+
if (hasDoubleQuote) {
135+
input = input.replace("\"", "\\\"");
136+
}
137+
104138
return input;
105139
}
106140

java/core/src/test/java/com/google/protobuf/TextFormatTest.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -988,6 +988,12 @@ public void testEscape() throws Exception {
988988
.isEqualTo(bytes("\0\001\007\b\f\n\r\t\013\\\'\""));
989989
assertThat(TextFormat.unescapeText("\\000\\001\\a\\b\\f\\n\\r\\t\\v\\\\\\'\\\""))
990990
.isEqualTo("\0\001\007\b\f\n\r\t\013\\\'\"");
991+
992+
String quotesAndBackslashOnly = "\\\"\'\\";
993+
assertThat(TextFormat.escapeText(quotesAndBackslashOnly)).isEqualTo("\\\\\\\"\\\'\\\\");
994+
assertThat(TextFormat.unescapeText(TextFormat.escapeText(quotesAndBackslashOnly)))
995+
.isEqualTo(quotesAndBackslashOnly);
996+
991997
assertThat(TextFormat.escapeText(ESCAPE_TEST_STRING)).isEqualTo(ESCAPE_TEST_STRING_ESCAPED);
992998
assertThat(TextFormat.unescapeText(ESCAPE_TEST_STRING_ESCAPED)).isEqualTo(ESCAPE_TEST_STRING);
993999

0 commit comments

Comments
 (0)