Skip to content

Commit 6d4c667

Browse files
committed
Improve string parsing efficiency and add length cap
1 parent 7b1acf1 commit 6d4c667

File tree

1 file changed

+65
-116
lines changed

1 file changed

+65
-116
lines changed

formatter/generic/genericformatter.cpp

Lines changed: 65 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ enum ItemType
1818
StatementSeparator,
1919
StringComponent,
2020
StringSeparator,
21+
StringSpace,
2122
FormatSpecifier,
2223
EscapeSequence,
2324
Group,
@@ -254,127 +255,75 @@ static vector<Item> CreateStatementItems(const vector<Item>& items)
254255
return result;
255256
}
256257

257-
static vector<InstructionTextToken> SeparateStringTokens(
258-
const InstructionTextToken& unprocessedStringToken
259-
)
258+
static vector<InstructionTextToken> ParseStringToken(
259+
const InstructionTextToken& unprocessedStringToken,
260+
const size_t maxParsingLength)
260261
{
261-
// Takes a StringToken and breaks it into sub-StringTokens along boundaries of punctuation
262-
// and spaces
263-
//
264-
// Ex.
265-
// "this.that" -> {"this", ".", "that"}
266-
// "format%llxsomething" -> {"format", "%llx", "something"}
267-
// "meep\n"moop" -> {"meep", "\n", "moop"}
268-
269-
vector<InstructionTextToken> result;
270-
string current;
271-
size_t i = 0;
272-
while (i < unprocessedStringToken.text.size())
273-
{
274-
char c = unprocessedStringToken.text[i];
275-
276-
// Handle format specifiers
277-
if (c == '%')
278-
{
279-
if (!current.empty())
280-
{
281-
result.push_back(InstructionTextToken(StringToken, current));
282-
current.clear();
283-
}
262+
const auto& src = unprocessedStringToken.text;
263+
const size_t tail = src.size();
284264

285-
string format = "%";
286-
i++;
287-
while (i < unprocessedStringToken.text.size())
288-
{
289-
c = unprocessedStringToken.text[i];
290-
if (!isalnum(c) && c != '.' && c != '-')
291-
break;
292-
format += c;
293-
i++;
294-
}
295-
result.push_back(InstructionTextToken(StringToken, format));
296-
continue;
297-
}
265+
// Max parsing length set for performance reasons, increase at your own peril!
266+
if (tail > maxParsingLength)
267+
return { unprocessedStringToken };
298268

299-
// Handle escape sequences
300-
if (c == '\\')
301-
{
302-
if (!current.empty())
303-
{
304-
result.push_back(InstructionTextToken(StringToken, current));
305-
current.clear();
306-
}
269+
vector<InstructionTextToken> result;
270+
size_t curStart = 0, curEnd = 0;
271+
auto ConstructToken = [&](size_t start, size_t end) {
272+
result.emplace_back(StringToken, string(src.substr(start, end - start)));
273+
};
307274

308-
string escape = "\\";
309-
if (i + 1 < unprocessedStringToken.text.size())
310-
{
311-
escape += unprocessedStringToken.text[i + 1];
312-
i += 2;
313-
}
314-
else
315-
i++;
316-
result.push_back(InstructionTextToken(StringToken, escape));
317-
continue;
318-
}
319-
320-
// Handle punctuation and spaces
321-
if (c == ',' || c == '.' || c == ':' || c == ';')
322-
{
323-
if (!current.empty())
324-
{
325-
result.push_back(InstructionTextToken(StringToken, current));
326-
current.clear();
327-
}
328-
329-
string repeated;
330-
repeated += c;
331-
while (i + 1 < unprocessedStringToken.text.size())
332-
{
333-
char next = unprocessedStringToken.text[i + 1];
334-
if (next == ',' || next == '.' || next == ':' || next == ';')
335-
{
336-
repeated += next;
337-
i++;
338-
}
339-
else
340-
break;
341-
}
342-
result.push_back(InstructionTextToken(StringToken, repeated));
343-
}
344-
else if (isspace(c))
345-
{
346-
if (!current.empty())
347-
{
348-
result.push_back(InstructionTextToken(StringToken, current));
349-
current.clear();
350-
}
351-
352-
string repeated;
353-
repeated += c;
354-
while (i + 1 < unprocessedStringToken.text.size())
355-
{
356-
char next = unprocessedStringToken.text[i + 1];
357-
if (isspace(next))
358-
{
359-
repeated += next;
360-
i++;
361-
}
362-
else
363-
break;
364-
}
365-
result.push_back(InstructionTextToken(StringToken, repeated));
366-
}
367-
else
368-
{
369-
current += c;
370-
}
371-
i++;
372-
}
275+
while (curEnd < tail)
276+
{
277+
char c = src[curEnd];
278+
279+
if (c == '%')
280+
{
281+
// Flush before format specifier
282+
if (curStart < curEnd)
283+
ConstructToken(curStart, curEnd);
284+
285+
size_t start = curEnd;
286+
curEnd++;
287+
while (curEnd < tail && (isalnum(src[curEnd]) || src[curEnd]=='.' || src[curEnd]=='-'))
288+
curEnd++;
289+
ConstructToken(start, curEnd);
290+
curStart = curEnd;
291+
}
292+
else if (c == '\\')
293+
{
294+
// Flush before escape sequence
295+
if (curStart < curEnd)
296+
ConstructToken(curStart, curEnd);
297+
298+
size_t start = curEnd;
299+
curEnd++; // consume '\'
300+
if (curEnd < tail)
301+
curEnd++; // consume escaped char
302+
ConstructToken(start, curEnd);
303+
curStart = curEnd;
304+
}
305+
else if (c == ',' || c == '.' || c == ':' || c == ';' || isspace(c))
306+
{
307+
// Flush before punctuation
308+
if (curStart < curEnd)
309+
ConstructToken(curStart, curEnd);
310+
// Group together repeated punctuation
311+
size_t start = curEnd;
312+
while (curEnd < tail && src[curEnd] == c)
313+
curEnd++;
314+
ConstructToken(start, curEnd);
315+
curStart = curEnd;
316+
}
317+
else
318+
{
319+
curEnd++;
320+
}
321+
}
373322

374-
if (!current.empty())
375-
result.push_back(InstructionTextToken(StringToken, current));
323+
if (curStart < curEnd)
324+
ConstructToken(curStart, curEnd);
376325

377-
return result;
326+
return result;
378327
}
379328

380329
static vector<Item> CreateStringGroups(const vector<Item>& items)
@@ -846,7 +795,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
846795
break;
847796
case StringToken:
848797
{
849-
vector<InstructionTextToken> stringTokens = SeparateStringTokens(token);
798+
vector<InstructionTextToken> stringTokens = ParseStringToken(token, 512);
850799
for (size_t k = 0; k < stringTokens.size(); k++)
851800
{
852801
InstructionTextToken subToken = stringTokens[k];

0 commit comments

Comments
 (0)