@@ -18,6 +18,7 @@ enum ItemType
1818 StatementSeparator,
1919 StringComponent,
2020 StringSeparator,
21+ StringSpace,
2122 FormatSpecifier,
2223 EscapeSequence,
2324 Group,
@@ -254,127 +255,75 @@ static vector<Item> CreateStatementItems(const vector<Item>& items)
254255 return result;
255256}
256257
257- static vector<InstructionTextToken> SeparateStringTokens (
258- const InstructionTextToken& unprocessedStringToken
259- )
258+ static vector<InstructionTextToken> ParseStringToken (
259+ const InstructionTextToken& unprocessedStringToken,
260+ const size_t maxParsingLength )
260261{
261- // Takes a StringToken and breaks it into sub-StringTokens along boundaries of punctuation
262- // and spaces
263- //
264- // Ex.
265- // "this.that" -> {"this", ".", "that"}
266- // "format%llxsomething" -> {"format", "%llx", "something"}
267- // "meep\n"moop" -> {"meep", "\n", "moop"}
268-
269- vector<InstructionTextToken> result;
270- string current;
271- size_t i = 0 ;
272- while (i < unprocessedStringToken.text .size ())
273- {
274- char c = unprocessedStringToken.text [i];
275-
276- // Handle format specifiers
277- if (c == ' %' )
278- {
279- if (!current.empty ())
280- {
281- result.push_back (InstructionTextToken (StringToken, current));
282- current.clear ();
283- }
262+ const auto & src = unprocessedStringToken.text ;
263+ const size_t tail = src.size ();
284264
285- string format = " %" ;
286- i++;
287- while (i < unprocessedStringToken.text .size ())
288- {
289- c = unprocessedStringToken.text [i];
290- if (!isalnum (c) && c != ' .' && c != ' -' )
291- break ;
292- format += c;
293- i++;
294- }
295- result.push_back (InstructionTextToken (StringToken, format));
296- continue ;
297- }
265+ // Max parsing length set for performance reasons, increase at your own peril!
266+ if (tail > maxParsingLength)
267+ return { unprocessedStringToken };
298268
299- // Handle escape sequences
300- if (c == ' \\ ' )
301- {
302- if (!current.empty ())
303- {
304- result.push_back (InstructionTextToken (StringToken, current));
305- current.clear ();
306- }
269+ vector<InstructionTextToken> result;
270+ size_t curStart = 0 , curEnd = 0 ;
271+ auto ConstructToken = [&](size_t start, size_t end) {
272+ result.emplace_back (StringToken, string (src.substr (start, end - start)));
273+ };
307274
308- string escape = " \\ " ;
309- if (i + 1 < unprocessedStringToken.text .size ())
310- {
311- escape += unprocessedStringToken.text [i + 1 ];
312- i += 2 ;
313- }
314- else
315- i++;
316- result.push_back (InstructionTextToken (StringToken, escape));
317- continue ;
318- }
319-
320- // Handle punctuation and spaces
321- if (c == ' ,' || c == ' .' || c == ' :' || c == ' ;' )
322- {
323- if (!current.empty ())
324- {
325- result.push_back (InstructionTextToken (StringToken, current));
326- current.clear ();
327- }
328-
329- string repeated;
330- repeated += c;
331- while (i + 1 < unprocessedStringToken.text .size ())
332- {
333- char next = unprocessedStringToken.text [i + 1 ];
334- if (next == ' ,' || next == ' .' || next == ' :' || next == ' ;' )
335- {
336- repeated += next;
337- i++;
338- }
339- else
340- break ;
341- }
342- result.push_back (InstructionTextToken (StringToken, repeated));
343- }
344- else if (isspace (c))
345- {
346- if (!current.empty ())
347- {
348- result.push_back (InstructionTextToken (StringToken, current));
349- current.clear ();
350- }
351-
352- string repeated;
353- repeated += c;
354- while (i + 1 < unprocessedStringToken.text .size ())
355- {
356- char next = unprocessedStringToken.text [i + 1 ];
357- if (isspace (next))
358- {
359- repeated += next;
360- i++;
361- }
362- else
363- break ;
364- }
365- result.push_back (InstructionTextToken (StringToken, repeated));
366- }
367- else
368- {
369- current += c;
370- }
371- i++;
372- }
275+ while (curEnd < tail)
276+ {
277+ char c = src[curEnd];
278+
279+ if (c == ' %' )
280+ {
281+ // Flush before format specifier
282+ if (curStart < curEnd)
283+ ConstructToken (curStart, curEnd);
284+
285+ size_t start = curEnd;
286+ curEnd++;
287+ while (curEnd < tail && (isalnum (src[curEnd]) || src[curEnd]==' .' || src[curEnd]==' -' ))
288+ curEnd++;
289+ ConstructToken (start, curEnd);
290+ curStart = curEnd;
291+ }
292+ else if (c == ' \\ ' )
293+ {
294+ // Flush before escape sequence
295+ if (curStart < curEnd)
296+ ConstructToken (curStart, curEnd);
297+
298+ size_t start = curEnd;
299+ curEnd++; // consume '\'
300+ if (curEnd < tail)
301+ curEnd++; // consume escaped char
302+ ConstructToken (start, curEnd);
303+ curStart = curEnd;
304+ }
305+ else if (c == ' ,' || c == ' .' || c == ' :' || c == ' ;' || isspace (c))
306+ {
307+ // Flush before punctuation
308+ if (curStart < curEnd)
309+ ConstructToken (curStart, curEnd);
310+ // Group together repeated punctuation
311+ size_t start = curEnd;
312+ while (curEnd < tail && src[curEnd] == c)
313+ curEnd++;
314+ ConstructToken (start, curEnd);
315+ curStart = curEnd;
316+ }
317+ else
318+ {
319+ curEnd++;
320+ }
321+ }
373322
374- if (!current. empty () )
375- result. push_back ( InstructionTextToken (StringToken, current) );
323+ if (curStart < curEnd )
324+ ConstructToken (curStart, curEnd );
376325
377- return result;
326+ return result;
378327}
379328
380329static vector<Item> CreateStringGroups (const vector<Item>& items)
@@ -846,7 +795,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
846795 break ;
847796 case StringToken:
848797 {
849- vector<InstructionTextToken> stringTokens = SeparateStringTokens (token);
798+ vector<InstructionTextToken> stringTokens = ParseStringToken (token, 512 );
850799 for (size_t k = 0 ; k < stringTokens.size (); k++)
851800 {
852801 InstructionTextToken subToken = stringTokens[k];
0 commit comments