Fix wrapping whitespace and limit wrapping to annotation length

spoonmilk · spoonmilk · commit a2e3aa2c04a5 · 2025-06-16T16:44:41.000-04:00
diff --git a/binaryninjaapi.h b/binaryninjaapi.h
@@ -14100,6 +14100,7 @@ namespace BinaryNinja {
 		size_t desiredLineLength;
 		size_t minimumContentLength;
 		size_t tabWidth;
+		size_t maximumAnnotationLength;
 		std::string languageName;
 		std::string commentStartString;
 		std::string commentEndString;
diff --git a/binaryninjacore.h b/binaryninjacore.h
@@ -3618,6 +3618,7 @@ extern "C"
 		size_t desiredLineLength;
 		size_t minimumContentLength;
 		size_t tabWidth;
+		size_t maximumAnnotationLength;
 		char* languageName;
 		char* commentStartString;
 		char* commentEndString;
diff --git a/formatter/generic/genericformatter.cpp b/formatter/generic/genericformatter.cpp
@@ -18,7 +18,8 @@ enum ItemType
 	StatementSeparator,
 	StringComponent,
 	StringSeparator,
-	StringSpace,
+	StringWhitespace,
+	StringNewline,
 	FormatSpecifier,
 	EscapeSequence,
 	Group,
@@ -272,6 +273,14 @@ static vector<InstructionTextToken> ParseStringToken(
         result.emplace_back(StringToken, string(src.substr(start, end - start)));
     };
 
+	// We generally split along spaces while keeping words intact, but some cases have
+	// specific splitting behavior:
+	//
+	// - Any format specifier (starting with %) will be treated as an atom even if embedded
+	//   within a word
+	// - Any escape sequence will also be treated as an atom
+	// - We split along punctuation like commas, colons, periods, and semicolons, grouping
+	//   trailing punctuation together.
     while (curEnd < tail)
     {
         char c = src[curEnd];
@@ -332,8 +341,10 @@ static vector<Item> CreateStringGroups(const vector<Item>& items)
     bool hasStrings = false;
     for (auto& i : items)
     {
-		if (i.type == StringSeparator && !i.tokens.empty())
+		if ((i.type == StringSeparator) && !i.tokens.empty())
 		{
+			// Attach the separator to the preceding word when there is one; otherwise
+			// emit it as a standalone atom
 			if (pending.empty())
 			{
 				result.push_back(Item {Atom, {}, {i.tokens}, 0});
@@ -347,6 +358,16 @@ static vector<Item> CreateStringGroups(const vector<Item>& items)
 			pending.clear();
 			hasStrings = true;
 		}
+    	else if (i.type == StringWhitespace)
+    	{
+    		// Special case: whitespace may trail past the desired width instead of forcing a wrap
+    		if (!pending.empty())
+    		{
+    			result.push_back(Item {StringComponent, pending, {}, 0});
+    			pending.clear();
+    		}
+    		result.push_back(Item {StringWhitespace, i.items, i.tokens, i.width});
+    	}
     	else if (i.type == FormatSpecifier || i.type == EscapeSequence)
     	{
     		if (!pending.empty())
@@ -795,17 +816,25 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
 				break;
 			case StringToken:
 			{
-				vector<InstructionTextToken> stringTokens = ParseStringToken(token, 512);
+				vector<InstructionTextToken> stringTokens = ParseStringToken(token, settings.maximumAnnotationLength);
 				for (size_t k = 0; k < stringTokens.size(); k++)
 				{
 					InstructionTextToken subToken = stringTokens[k];
 					string trimmedSubText = TrimString(subToken.text);
 					if (trimmedSubText.empty())
-						items.push_back(Item {StringSeparator, {}, {subToken}, 0});
+						items.push_back(Item {StringWhitespace, {}, {subToken}, 0});
 					if (trimmedSubText[0] == '%')
 						items.push_back(Item {FormatSpecifier, {}, {subToken}, 0});
 					else if (!trimmedSubText.empty() && trimmedSubText[0] == '\\')
+					{
+						if (trimmedSubText.size() > 1)
+						{
+							if (trimmedSubText[1] == 'n')
+								items.push_back(Item {StringNewline, {}, {subToken}, 0});
+							continue;
+						}
 						items.push_back(Item {EscapeSequence, {}, {subToken}, 0});
+					}
 					else if (trimmedSubText[0] == ',' || trimmedSubText[0] == '.' || trimmedSubText[0] == ':' || trimmedSubText[0] == ';')
 						items.push_back(Item {StringSeparator, {}, {subToken}, 0});
 					else
@@ -908,7 +937,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
 
 			for (auto item = items.begin(); item != items.end();)
 			{
-				if (currentWidth + item->width > desiredWidth)
+				if (currentWidth + item->width > desiredWidth && item->type != StringWhitespace)
 				{
 					// Current item is too wide to fit on the current line, will need to start a new line.
 					auto next = item;
diff --git a/lineformatter.cpp b/lineformatter.cpp
@@ -53,6 +53,7 @@ LineFormatterSettings LineFormatterSettings::FromAPIObject(const BNLineFormatter
 	result.desiredLineLength = settings->desiredLineLength;
 	result.minimumContentLength = settings->minimumContentLength;
 	result.tabWidth = settings->tabWidth;
+	result.maximumAnnotationLength = settings->maximumAnnotationLength;
 	result.languageName = settings->languageName;
 	result.commentStartString = settings->commentStartString;
 	result.commentEndString = settings->commentEndString;
@@ -69,6 +70,7 @@ BNLineFormatterSettings LineFormatterSettings::ToAPIObject() const
 	result.desiredLineLength = desiredLineLength;
 	result.minimumContentLength = minimumContentLength;
 	result.tabWidth = tabWidth;
+	result.maximumAnnotationLength = maximumAnnotationLength;
 	result.languageName = (char*)languageName.c_str();
 	result.commentStartString = (char*)commentStartString.c_str();
 	result.commentEndString = (char*)commentEndString.c_str();