@@ -16,6 +16,11 @@ enum ItemType
1616 ArgumentSeparator,
1717 Statement,
1818 StatementSeparator,
19+ StringComponent,
20+ StringSeparator,
21+ StringWhitespace,
22+ FormatSpecifier,
23+ EscapeSequence,
1924 Group,
2025 Container,
2126 StartOfContainer,
@@ -250,6 +255,161 @@ static vector<Item> CreateStatementItems(const vector<Item>& items)
250255 return result;
251256}
252257
258+ static vector<InstructionTextToken> ParseStringToken (
259+ const InstructionTextToken& unprocessedStringToken,
260+ const size_t maxParsingLength)
261+ {
262+ const auto & src = unprocessedStringToken.text ;
263+ const size_t tail = src.size ();
264+
265+ // Max parsing length set to max annotation length
266+ if (tail > maxParsingLength)
267+ return { unprocessedStringToken };
268+ vector<InstructionTextToken> result;
269+ size_t curStart = 0 , curEnd = 0 ;
270+
271+ auto ConstructToken = [&](size_t start, size_t end) {
272+ InstructionTextToken token = unprocessedStringToken;
273+ const string newTxt = string (src.substr (start, end - start));
274+ token.text = newTxt;
275+ token.width = newTxt.size ();
276+ result.emplace_back (token);
277+ };
278+
279+ auto flushToken = [&](size_t start, size_t end)
280+ {
281+ if (start < end)
282+ ConstructToken (start, end);
283+ };
284+
285+ // We generally split along spaces while keeping words intact, but some cases have
286+ // specific splitting behavior:
287+ //
288+ // - Any format specifier (starting with %) will be treated as an atom even if embedded
289+ // within a word
290+ // - Any escape sequence will also be treated as an atom
291+ // - We split along punctuation like commas, colons, periods, and semicolons, grouping
292+ // trailing punctuation together.
293+ while (curEnd < tail)
294+ {
295+ char c = src[curEnd];
296+
297+ if (c == ' %' )
298+ {
299+ // Flush before format specifier
300+ flushToken (curStart, curEnd);
301+
302+ size_t start = curEnd;
303+ curEnd++;
304+ while (curEnd < tail && (isalnum (src[curEnd]) || src[curEnd]==' .' || src[curEnd]==' -' ))
305+ curEnd++;
306+ ConstructToken (start, curEnd);
307+ curStart = curEnd;
308+ }
309+ else if (c == ' \\ ' )
310+ {
311+ // Flush before escape sequence
312+ flushToken (curStart, curEnd);
313+
314+ size_t start = curEnd;
315+ curEnd++; // consume '\'
316+ if (curEnd < tail)
317+ curEnd++; // consume escaped char
318+ ConstructToken (start, curEnd);
319+ curStart = curEnd;
320+ }
321+ else if (c == ' ,' || c == ' .' || c == ' :' || c == ' ;' || isspace (c))
322+ {
323+ // Flush before punctuation
324+ flushToken (curStart, curEnd);
325+
326+ // Group together repeated punctuation
327+ size_t start = curEnd;
328+ while (curEnd < tail && src[curEnd] == c)
329+ curEnd++;
330+ ConstructToken (start, curEnd);
331+ curStart = curEnd;
332+ }
333+ else
334+ {
335+ curEnd++;
336+ }
337+ }
338+
339+ flushToken (curStart, curEnd);
340+ return result;
341+ }
342+
343+ static vector<Item> CreateStringGroups (const vector<Item>& items)
344+ {
345+ vector<Item> result, pending;
346+ bool hasStrings = false ;
347+ for (auto & i : items)
348+ {
349+ if (i.type == StringSeparator && !i.tokens .empty ())
350+ {
351+ // We try to push separators onto a preceding word, otherwise treat as
352+ // a singular atom
353+ if (pending.empty ())
354+ {
355+ result.push_back (Item {Atom, {}, {i.tokens }, 0 });
356+ }
357+ else
358+ {
359+ for (auto & j : i.tokens )
360+ pending.back ().AddTokenToLastAtom (j);
361+ result.push_back (Item {StringComponent, pending, {}, 0 });
362+ }
363+ pending.clear ();
364+ hasStrings = true ;
365+ }
366+ else if (i.type == StringWhitespace)
367+ {
368+ // Special case because we let whitespace trail even if over width
369+ if (!pending.empty ())
370+ {
371+ result.push_back (Item {StringComponent, pending, {}, 0 });
372+ pending.clear ();
373+ }
374+ result.push_back (Item {StringWhitespace, i.items , i.tokens , i.width });
375+ }
376+ else if (i.type == FormatSpecifier || i.type == EscapeSequence)
377+ {
378+ // Flush previous tokens before special sequences like format specifiers or
379+ // escape sequences
380+ if (!pending.empty ())
381+ {
382+ result.push_back (Item {StringComponent, pending, {}, 0 });
383+ pending.clear ();
384+ }
385+ result.push_back (Item { Atom, i.items , i.tokens , i.width });
386+ }
387+
388+ else if (i.type == StartOfContainer && pending.empty ())
389+ {
390+ result.push_back (i);
391+ }
392+ else if (i.type == EndOfContainer && hasStrings && !pending.empty ())
393+ {
394+ result.push_back (Item {StringComponent, pending, {}, 0 });
395+ result.push_back (i);
396+ }
397+ else
398+ {
399+ pending.push_back (Item {i.type , CreateStringGroups (i.items ), i.tokens , 0 });
400+ }
401+ }
402+
403+ if (!pending.empty ())
404+ {
405+ if (hasStrings)
406+ result.push_back (Item {StringComponent, pending, {}, 0 });
407+ else
408+ result.insert (result.end (), pending.begin (), pending.end ());
409+ }
410+
411+ return result;
412+ }
253413
254414static vector<Item> CreateAssignmentOperatorGroups (const vector<Item>& items)
255415{
@@ -576,8 +736,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
576736 size_t tokenIndex = indentationTokens.size ();
577737
578738 // First break the line down into nested container items. A container is anything between a pair of
579- // BraceTokens (except for strings, where the entire string, including the quotes, are treated as
580- // a single atom).
739+ // BraceTokens
581740 vector<Item> items;
582741 stack<vector<Item>> itemStack;
583742 for (; tokenIndex < currentLine.tokens .size (); tokenIndex++)
@@ -588,29 +747,30 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
588747 switch (token.type )
589748 {
590749 case BraceToken:
750+ // Beginning of string
591751 if (tokenIndex + 1 < currentLine.tokens .size ()
592752 && currentLine.tokens [tokenIndex + 1 ].type == StringToken)
593753 {
594- // Treat string tokens surrounded by brace tokens as a unit (this is usually the quotes
595- // surrounding the string)
596- Item atom;
597- atom.type = Atom;
598- atom.tokens .push_back (token);
599- atom.tokens .push_back (currentLine.tokens [tokenIndex + 1 ]);
600- atom.width = 0 ;
601- tokenIndex++;
602- if (tokenIndex + 1 < currentLine.tokens .size ()
603- && currentLine.tokens [tokenIndex + 1 ].type == BraceToken)
604- {
605- atom.tokens .push_back (currentLine.tokens [tokenIndex + 1 ]);
606- tokenIndex++;
607- }
754+ // Create a ContainerContents item and place it onto the item stack. This will hold anything
755+ // inside the container once the end of the container is found.
756+ items.push_back (Item {Container, {}, {}, 0 });
757+ itemStack.push (items);
608758
609- items.push_back (atom);
610- break ;
759+ // Starting a new context
760+ items.clear ();
761+ items.push_back (Item {StartOfContainer, {}, {token}, 0 });
611762 }
612-
613- if (trimmedText == " (" || trimmedText == " [" || trimmedText == " {" )
763+ // End of string
764+ else if (currentLine.tokens [tokenIndex].type == StringToken
765+ && tokenIndex + 1 < currentLine.tokens .size ()
766+ && currentLine.tokens [tokenIndex + 1 ].type == BraceToken)
767+ {
768+ // Create a ContainerContents item and place it onto the item stack. This will hold anything
769+ // inside the container once the end of the container is found.
770+ items.push_back (Item {Container, {}, {}, 0 });
771+ itemStack.push (items);
772+ }
773+ else if (trimmedText == " (" || trimmedText == " [" || trimmedText == " {" )
614774 {
615775 // Create a ContainerContents item and place it onto the item stack. This will hold anything
616776 // inside the container once the end of the container is found.
@@ -663,6 +823,25 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
663823 else
664824 items.push_back (Item {Operator, {}, {token}, 0 });
665825 break ;
826+ case StringToken:
827+ {
828+ vector<InstructionTextToken> stringTokens = ParseStringToken (token, settings.maximumAnnotationLength );
829+ for (auto subToken : stringTokens)
830+ {
831+ string trimmedSubText = TrimString (subToken.text );
832+ if (trimmedSubText.empty ())
833+ items.push_back (Item {StringWhitespace, {}, {subToken}, 0 });
834+ if (trimmedSubText[0 ] == ' %' )
835+ items.push_back (Item {FormatSpecifier, {}, {subToken}, 0 });
836+ else if (!trimmedSubText.empty () && trimmedSubText[0 ] == ' \\ ' )
837+ items.push_back (Item {EscapeSequence, {}, {subToken}, 0 });
838+ else if (trimmedSubText[0 ] == ' ,' || trimmedSubText[0 ] == ' .' || trimmedSubText[0 ] == ' :' || trimmedSubText[0 ] == ' ;' )
839+ items.push_back (Item {StringSeparator, {}, {subToken}, 0 });
840+ else
841+ items.push_back (Item {Atom, {}, {subToken}, 0 });
842+ }
843+ break ;
844+ }
666845 default :
667846 items.push_back (Item {Atom, {}, {token}, 0 });
668847 break ;
@@ -699,6 +878,10 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
699878 // the previous atom.
700879 items = RelocateStartAndEndOfContainerItems (items);
701880
881+ // Create internal groupings for displaying strings -- grouping items by punctuation, format specifiers, and
882+ // escape sequences
883+ items = CreateStringGroups (items);
884+
702885 // Now that items are done, compute widths for layout
703886 for (auto & j : items)
704887 j.CalculateWidth ();
@@ -754,9 +937,16 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
754937
755938 for (auto item = items.begin (); item != items.end ();)
756939 {
757- if (currentWidth + item->width > desiredWidth)
940+ if (item->type == StringComponent && currentWidth + item->width > desiredWidth)
941+ {
942+ // If a string is too wide to fit on the current line, create a newline
943+ // without additional indentation
944+ newLine ();
945+ }
946+ else if (currentWidth + item->width > desiredWidth && item->type != StringWhitespace)
758947 {
759948 // Current item is too wide to fit on the current line, will need to start a new line.
949+ // Whitespace is allowed to be too wide; we push it on as the preceding word is wrapped.
760950 auto next = item;
761951 ++next;
762952
0 commit comments