Skip to content

Commit 1eba4dc

Browse files
derekmaurocopybara-github
authored andcommitted
Refactor absl::CUnescape() to use direct string output
instead of pointer/size. This fixes a UBSAN failure where an offset was being added to NULL and allows for additional memory safety checks present in hardened libc++/Abseil. The version of absl::CUnescape() that used pointer/size is no longer needed. PiperOrigin-RevId: 742751734 Change-Id: I2a209a13a236f719208d4462afbd7bbddff09566
1 parent c65fa83 commit 1eba4dc

File tree

2 files changed

+130
-138
lines changed

2 files changed

+130
-138
lines changed

absl/strings/escaping.cc

Lines changed: 111 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -76,49 +76,49 @@ inline bool IsSurrogate(char32_t c, absl::string_view src,
7676
//
7777
// Unescapes C escape sequences and is the reverse of CEscape().
7878
//
79-
// If 'source' is valid, stores the unescaped string and its size in
80-
// 'dest' and 'dest_len' respectively, and returns true. Otherwise
81-
// returns false and optionally stores the error description in
82-
// 'error'. Set 'error' to nullptr to disable error reporting.
79+
// If `src` is valid, stores the unescaped string `dst`, and returns
80+
// true. Otherwise returns false and optionally stores the error
81+
// description in `error`. Set `error` to nullptr to disable error
82+
// reporting.
8383
//
84-
// 'dest' should point to a buffer that is at least as big as 'source'.
85-
// 'source' and 'dest' may be the same.
86-
//
87-
// NOTE: any changes to this function must also be reflected in the older
88-
// UnescapeCEscapeSequences().
84+
// `src` and `dst` may use the same underlying buffer.
8985
// ----------------------------------------------------------------------
90-
bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
91-
absl::Nonnull<char*> dest,
92-
absl::Nonnull<ptrdiff_t*> dest_len,
86+
87+
bool CUnescapeInternal(absl::string_view src, bool leave_nulls_escaped,
88+
absl::Nonnull<std::string*> dst,
9389
absl::Nullable<std::string*> error) {
94-
char* d = dest;
95-
const char* p = source.data();
96-
const char* end = p + source.size();
97-
const char* last_byte = end - 1;
90+
strings_internal::STLStringResizeUninitialized(dst, src.size());
91+
92+
absl::string_view::size_type p = 0; // Current src position.
93+
std::string::size_type d = 0; // Current dst position.
9894

99-
// Small optimization for case where source = dest and there's no escaping
100-
while (p == d && p < end && *p != '\\') p++, d++;
95+
// When unescaping in-place, skip any prefix that does not have escaping.
96+
if (src.data() == dst->data()) {
97+
while (p < src.size() && src[p] != '\\') p++, d++;
98+
}
10199

102-
while (p < end) {
103-
if (*p != '\\') {
104-
*d++ = *p++;
100+
while (p < src.size()) {
101+
if (src[p] != '\\') {
102+
(*dst)[d++] = src[p++];
105103
} else {
106-
if (++p > last_byte) { // skip past the '\\'
107-
if (error) *error = "String cannot end with \\";
104+
if (++p >= src.size()) { // skip past the '\\'
105+
if (error != nullptr) {
106+
*error = "String cannot end with \\";
107+
}
108108
return false;
109109
}
110-
switch (*p) {
111-
case 'a': *d++ = '\a'; break;
112-
case 'b': *d++ = '\b'; break;
113-
case 'f': *d++ = '\f'; break;
114-
case 'n': *d++ = '\n'; break;
115-
case 'r': *d++ = '\r'; break;
116-
case 't': *d++ = '\t'; break;
117-
case 'v': *d++ = '\v'; break;
118-
case '\\': *d++ = '\\'; break;
119-
case '?': *d++ = '\?'; break; // \? Who knew?
120-
case '\'': *d++ = '\''; break;
121-
case '"': *d++ = '\"'; break;
110+
switch (src[p]) {
111+
case 'a': (*dst)[d++] = '\a'; break;
112+
case 'b': (*dst)[d++] = '\b'; break;
113+
case 'f': (*dst)[d++] = '\f'; break;
114+
case 'n': (*dst)[d++] = '\n'; break;
115+
case 'r': (*dst)[d++] = '\r'; break;
116+
case 't': (*dst)[d++] = '\t'; break;
117+
case 'v': (*dst)[d++] = '\v'; break;
118+
case '\\': (*dst)[d++] = '\\'; break;
119+
case '?': (*dst)[d++] = '\?'; break;
120+
case '\'': (*dst)[d++] = '\''; break;
121+
case '"': (*dst)[d++] = '\"'; break;
122122
case '0':
123123
case '1':
124124
case '2':
@@ -128,188 +128,170 @@ bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
128128
case '6':
129129
case '7': {
130130
// octal digit: 1 to 3 digits
131-
const char* octal_start = p;
132-
unsigned int ch = static_cast<unsigned int>(*p - '0'); // digit 1
133-
if (p < last_byte && is_octal_digit(p[1]))
134-
ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 2
135-
if (p < last_byte && is_octal_digit(p[1]))
136-
ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 3
131+
auto octal_start = p;
132+
unsigned int ch = static_cast<unsigned int>(src[p] - '0'); // digit 1
133+
if (p + 1 < src.size() && is_octal_digit(src[p + 1]))
134+
ch = ch * 8 + static_cast<unsigned int>(src[++p] - '0'); // digit 2
135+
if (p + 1 < src.size() && is_octal_digit(src[p + 1]))
136+
ch = ch * 8 + static_cast<unsigned int>(src[++p] - '0'); // digit 3
137137
if (ch > 0xff) {
138-
if (error) {
139-
*error = "Value of \\" +
140-
std::string(octal_start,
141-
static_cast<size_t>(p + 1 - octal_start)) +
142-
" exceeds 0xff";
138+
if (error != nullptr) {
139+
*error =
140+
"Value of \\" +
141+
std::string(src.substr(octal_start, p + 1 - octal_start)) +
142+
" exceeds 0xff";
143143
}
144144
return false;
145145
}
146146
if ((ch == 0) && leave_nulls_escaped) {
147147
// Copy the escape sequence for the null character
148-
const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
149-
*d++ = '\\';
150-
memmove(d, octal_start, octal_size);
151-
d += octal_size;
148+
(*dst)[d++] = '\\';
149+
while (octal_start <= p) {
150+
(*dst)[d++] = src[octal_start++];
151+
}
152152
break;
153153
}
154-
*d++ = static_cast<char>(ch);
154+
(*dst)[d++] = static_cast<char>(ch);
155155
break;
156156
}
157157
case 'x':
158158
case 'X': {
159-
if (p >= last_byte) {
160-
if (error) *error = "String cannot end with \\x";
159+
if (p + 1 >= src.size()) {
160+
if (error != nullptr) {
161+
*error = "String cannot end with \\x";
162+
}
161163
return false;
162-
} else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
163-
if (error) *error = "\\x cannot be followed by a non-hex digit";
164+
} else if (!absl::ascii_isxdigit(
165+
static_cast<unsigned char>(src[p + 1]))) {
166+
if (error != nullptr) {
167+
*error = "\\x cannot be followed by a non-hex digit";
168+
}
164169
return false;
165170
}
166171
unsigned int ch = 0;
167-
const char* hex_start = p;
168-
while (p < last_byte &&
169-
absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
172+
auto hex_start = p;
173+
while (p + 1 < src.size() &&
174+
absl::ascii_isxdigit(static_cast<unsigned char>(src[p + 1]))) {
170175
// Arbitrarily many hex digits
171-
ch = (ch << 4) + hex_digit_to_int(*++p);
176+
ch = (ch << 4) + hex_digit_to_int(src[++p]);
177+
}
172178
if (ch > 0xFF) {
173-
if (error) {
179+
if (error != nullptr) {
174180
*error = "Value of \\" +
175-
std::string(hex_start,
176-
static_cast<size_t>(p + 1 - hex_start)) +
181+
std::string(src.substr(hex_start, p + 1 - hex_start)) +
177182
" exceeds 0xff";
178183
}
179184
return false;
180185
}
181186
if ((ch == 0) && leave_nulls_escaped) {
182187
// Copy the escape sequence for the null character
183-
const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
184-
*d++ = '\\';
185-
memmove(d, hex_start, hex_size);
186-
d += hex_size;
188+
(*dst)[d++] = '\\';
189+
while (hex_start <= p) {
190+
(*dst)[d++] = src[hex_start++];
191+
}
187192
break;
188193
}
189-
*d++ = static_cast<char>(ch);
194+
(*dst)[d++] = static_cast<char>(ch);
190195
break;
191196
}
192197
case 'u': {
193198
// \uhhhh => convert 4 hex digits to UTF-8
194199
char32_t rune = 0;
195-
const char* hex_start = p;
196-
if (p + 4 >= end) {
197-
if (error) {
198-
*error = "\\u must be followed by 4 hex digits: \\" +
199-
std::string(hex_start,
200-
static_cast<size_t>(p + 1 - hex_start));
200+
auto hex_start = p;
201+
if (p + 4 >= src.size()) {
202+
if (error != nullptr) {
203+
*error = "\\u must be followed by 4 hex digits";
201204
}
202205
return false;
203206
}
204207
for (int i = 0; i < 4; ++i) {
205208
// Look one char ahead.
206-
if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
207-
rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
209+
if (absl::ascii_isxdigit(static_cast<unsigned char>(src[p + 1]))) {
210+
rune = (rune << 4) + hex_digit_to_int(src[++p]);
208211
} else {
209-
if (error) {
212+
if (error != nullptr) {
210213
*error = "\\u must be followed by 4 hex digits: \\" +
211-
std::string(hex_start,
212-
static_cast<size_t>(p + 1 - hex_start));
214+
std::string(src.substr(hex_start, p + 1 - hex_start));
213215
}
214216
return false;
215217
}
216218
}
217219
if ((rune == 0) && leave_nulls_escaped) {
218220
// Copy the escape sequence for the null character
219-
*d++ = '\\';
220-
memmove(d, hex_start, 5); // u0000
221-
d += 5;
221+
(*dst)[d++] = '\\';
222+
while (hex_start <= p) {
223+
(*dst)[d++] = src[hex_start++];
224+
}
222225
break;
223226
}
224-
if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
227+
if (IsSurrogate(rune, src.substr(hex_start, 5), error)) {
225228
return false;
226229
}
227-
d += strings_internal::EncodeUTF8Char(d, rune);
230+
d += strings_internal::EncodeUTF8Char(dst->data() + d, rune);
228231
break;
229232
}
230233
case 'U': {
231234
// \Uhhhhhhhh => convert 8 hex digits to UTF-8
232235
char32_t rune = 0;
233-
const char* hex_start = p;
234-
if (p + 8 >= end) {
235-
if (error) {
236-
*error = "\\U must be followed by 8 hex digits: \\" +
237-
std::string(hex_start,
238-
static_cast<size_t>(p + 1 - hex_start));
236+
auto hex_start = p;
237+
if (p + 8 >= src.size()) {
238+
if (error != nullptr) {
239+
*error = "\\U must be followed by 8 hex digits";
239240
}
240241
return false;
241242
}
242243
for (int i = 0; i < 8; ++i) {
243244
// Look one char ahead.
244-
if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
245+
if (absl::ascii_isxdigit(static_cast<unsigned char>(src[p + 1]))) {
245246
// Don't change rune until we're sure this
246247
// is within the Unicode limit, but do advance p.
247-
uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
248+
uint32_t newrune = (rune << 4) + hex_digit_to_int(src[++p]);
248249
if (newrune > 0x10FFFF) {
249-
if (error) {
250-
*error = "Value of \\" +
251-
std::string(hex_start,
252-
static_cast<size_t>(p + 1 - hex_start)) +
253-
" exceeds Unicode limit (0x10FFFF)";
250+
if (error != nullptr) {
251+
*error =
252+
"Value of \\" +
253+
std::string(src.substr(hex_start, p + 1 - hex_start)) +
254+
" exceeds Unicode limit (0x10FFFF)";
254255
}
255256
return false;
256257
} else {
257258
rune = newrune;
258259
}
259260
} else {
260-
if (error) {
261+
if (error != nullptr) {
261262
*error = "\\U must be followed by 8 hex digits: \\" +
262-
std::string(hex_start,
263-
static_cast<size_t>(p + 1 - hex_start));
263+
std::string(src.substr(hex_start, p + 1 - hex_start));
264264
}
265265
return false;
266266
}
267267
}
268268
if ((rune == 0) && leave_nulls_escaped) {
269269
// Copy the escape sequence for the null character
270-
*d++ = '\\';
271-
memmove(d, hex_start, 9); // U00000000
272-
d += 9;
270+
(*dst)[d++] = '\\';
271+
// U00000000
272+
while (hex_start <= p) {
273+
(*dst)[d++] = src[hex_start++];
274+
}
273275
break;
274276
}
275-
if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
277+
if (IsSurrogate(rune, src.substr(hex_start, 9), error)) {
276278
return false;
277279
}
278-
d += strings_internal::EncodeUTF8Char(d, rune);
280+
d += strings_internal::EncodeUTF8Char(dst->data() + d, rune);
279281
break;
280282
}
281283
default: {
282-
if (error) *error = std::string("Unknown escape sequence: \\") + *p;
284+
if (error != nullptr) {
285+
*error = std::string("Unknown escape sequence: \\") + src[p];
286+
}
283287
return false;
284288
}
285289
}
286-
p++; // read past letter we escaped
290+
p++; // Read past letter we escaped.
287291
}
288292
}
289-
*dest_len = d - dest;
290-
return true;
291-
}
292293

293-
// ----------------------------------------------------------------------
294-
// CUnescapeInternal()
295-
//
296-
// Same as above but uses a std::string for output. 'source' and 'dest'
297-
// may be the same.
298-
// ----------------------------------------------------------------------
299-
bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
300-
absl::Nonnull<std::string*> dest,
301-
absl::Nullable<std::string*> error) {
302-
strings_internal::STLStringResizeUninitialized(dest, source.size());
303-
304-
ptrdiff_t dest_size;
305-
if (!CUnescapeInternal(source,
306-
leave_nulls_escaped,
307-
&(*dest)[0],
308-
&dest_size,
309-
error)) {
310-
return false;
311-
}
312-
dest->erase(static_cast<size_t>(dest_size));
294+
dst->erase(d);
313295
return true;
314296
}
315297

absl/strings/escaping_test.cc

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -169,15 +169,25 @@ TEST(Unescape, BasicFunction) {
169169
EXPECT_TRUE(absl::CUnescape(val.escaped, &out));
170170
EXPECT_EQ(out, val.unescaped);
171171
}
172-
std::string bad[] = {"\\u1", // too short
173-
"\\U1", // too short
174-
"\\Uffffff", // exceeds 0x10ffff (largest Unicode)
175-
"\\U00110000", // exceeds 0x10ffff (largest Unicode)
176-
"\\uD835", // surrogate character (D800-DFFF)
177-
"\\U0000DD04", // surrogate character (D800-DFFF)
178-
"\\777", // exceeds 0xff
179-
"\\xABCD"}; // exceeds 0xff
180-
for (const std::string& e : bad) {
172+
constexpr absl::string_view bad[] = {
173+
"\\u1", // too short
174+
"\\U1", // too short
175+
"\\Uffffff", // exceeds 0x10ffff (largest Unicode)
176+
"\\U00110000", // exceeds 0x10ffff (largest Unicode)
177+
"\\uD835", // surrogate character (D800-DFFF)
178+
"\\U0000DD04", // surrogate character (D800-DFFF)
179+
"\\777", // exceeds 0xff
180+
"\\xABCD", // exceeds 0xff
181+
"endswith\\", // ends with "\"
182+
"endswith\\x", // ends with "\x"
183+
"endswith\\X", // ends with "\X"
184+
"\\x.2345678", // non-hex follows "\x"
185+
"\\X.2345678", // non-hex follows "\X"
186+
"\\u.2345678", // non-hex follows "\U"
187+
"\\U.2345678", // non-hex follows "\U"
188+
"\\.unknown", // unknown escape sequence
189+
};
190+
for (const auto e : bad) {
181191
std::string error;
182192
std::string out;
183193
EXPECT_FALSE(absl::CUnescape(e, &out, &error));

0 commit comments

Comments
 (0)