Improve string parsing efficiency and add length cap

spoonmilk · spoonmilk · commit 6d4c667cd3c9 · 2025-06-16T16:44:41.000-04:00
diff --git a/formatter/generic/genericformatter.cpp b/formatter/generic/genericformatter.cpp
@@ -18,6 +18,7 @@ enum ItemType
 	StatementSeparator,
 	StringComponent,
 	StringSeparator,
+	StringSpace,
 	FormatSpecifier,
 	EscapeSequence,
 	Group,
@@ -254,127 +255,75 @@ static vector<Item> CreateStatementItems(const vector<Item>& items)
 	return result;
 }
 
-static vector<InstructionTextToken> SeparateStringTokens(
-	const InstructionTextToken& unprocessedStringToken
-)
+// Splits the text of a string token into a sequence of StringTokens:
+//   - format specifiers: '%' plus any following alphanumerics, '.' or '-' ("%llx")
+//   - escape sequences: '\' plus the next character, if there is one
+//   - runs of one repeated separator character (',', '.', ':', ';' or whitespace)
+//   - the plain text between the above
+// Text longer than maxParsingLength is returned unsplit as the only element.
+static vector<InstructionTextToken> ParseStringToken(
+	const InstructionTextToken& unprocessedStringToken,
+	const size_t maxParsingLength)
 {
-	// Takes a StringToken and breaks it into sub-StringTokens along boundaries of punctuation
-	// and spaces
-	//
-	// Ex.
-	// "this.that" -> {"this", ".", "that"}
-	// "format%llxsomething" -> {"format", "%llx", "something"}
-	// "meep\n"moop" -> {"meep", "\n", "moop"}
-
-	vector<InstructionTextToken> result;
-	string current;
-	size_t i = 0;
-	while (i < unprocessedStringToken.text.size())
-	{
-		char c = unprocessedStringToken.text[i];
-
-		// Handle format specifiers
-		if (c == '%')
-		{
-			if (!current.empty())
-			{
-				result.push_back(InstructionTextToken(StringToken, current));
-				current.clear();
-			}
+	const auto& src = unprocessedStringToken.text;
+	const size_t tail = src.size();
 
-			string format = "%";
-			i++;
-			while (i < unprocessedStringToken.text.size())
-			{
-				c = unprocessedStringToken.text[i];
-				if (!isalnum(c) && c != '.' && c != '-')
-					break;
-				format += c;
-				i++;
-			}
-			result.push_back(InstructionTextToken(StringToken, format));
-			continue;
-		}
+	// Max parsing length set for performance reasons, increase at your own peril!
+	if (tail > maxParsingLength)
+		return { unprocessedStringToken };
 
-		// Handle escape sequences
-		if (c == '\\')
-		{
-			if (!current.empty())
-			{
-				result.push_back(InstructionTextToken(StringToken, current));
-				current.clear();
-			}
+	vector<InstructionTextToken> result;
+	size_t curStart = 0, curEnd = 0;
+	// Appends src[start, end) to the result as a new StringToken
+	auto ConstructToken = [&](size_t start, size_t end) {
+		result.emplace_back(StringToken, string(src.substr(start, end - start)));
+	};
 
-			string escape = "\\";
-			if (i + 1 < unprocessedStringToken.text.size())
-			{
-				escape += unprocessedStringToken.text[i + 1];
-				i += 2;
-			}
-			else
-				i++;
-			result.push_back(InstructionTextToken(StringToken, escape));
-			continue;
-		}
-
-		// Handle punctuation and spaces
-		if (c == ',' || c == '.' || c == ':' || c == ';')
-		{
-			if (!current.empty())
-			{
-				result.push_back(InstructionTextToken(StringToken, current));
-				current.clear();
-			}
-
-			string repeated;
-			repeated += c;
-			while (i + 1 < unprocessedStringToken.text.size())
-			{
-				char next = unprocessedStringToken.text[i + 1];
-				if (next == ',' || next == '.' || next == ':' || next == ';')
-				{
-					repeated += next;
-					i++;
-				}
-				else
-					break;
-			}
-			result.push_back(InstructionTextToken(StringToken, repeated));
-		}
-		else if (isspace(c))
-		{
-			if (!current.empty())
-			{
-				result.push_back(InstructionTextToken(StringToken, current));
-				current.clear();
-			}
-
-			string repeated;
-			repeated += c;
-			while (i + 1 < unprocessedStringToken.text.size())
-			{
-				char next = unprocessedStringToken.text[i + 1];
-				if (isspace(next))
-				{
-					repeated += next;
-					i++;
-				}
-				else
-					break;
-			}
-			result.push_back(InstructionTextToken(StringToken, repeated));
-		}
-		else
-		{
-			current += c;
-		}
-		i++;
-	}
+	while (curEnd < tail)
+	{
+		const char c = src[curEnd];
+		// <cctype> classifiers take an unsigned char value; passing a negative char
+		// (any byte >= 0x80 where char is signed) is undefined behavior, hence the casts.
+		const bool isSeparator = c == ',' || c == '.' || c == ':' || c == ';'
+			|| isspace(static_cast<unsigned char>(c));
+		if (c != '%' && c != '\\' && !isSeparator)
+		{
+			// Ordinary character: extend the pending plain-text run
+			curEnd++;
+			continue;
+		}
+
+		// Flush pending plain text, then start the special token at c
+		if (curStart < curEnd)
+			ConstructToken(curStart, curEnd);
+		curStart = curEnd;
+		curEnd++;
+		if (c == '%')
+		{
+			// Format specifier: '%' followed by alphanumerics, '.' or '-'
+			while (curEnd < tail && (isalnum(static_cast<unsigned char>(src[curEnd])) || src[curEnd] == '.' || src[curEnd] == '-'))
+				curEnd++;
+		}
+		else if (c == '\\')
+		{
+			// Escape sequence: '\' plus the escaped character, if there is one
+			if (curEnd < tail)
+				curEnd++;
+		}
+		else
+		{
+			// Group together repeats of the same separator character
+			while (curEnd < tail && src[curEnd] == c)
+				curEnd++;
+		}
+		ConstructToken(curStart, curEnd);
+		curStart = curEnd;
+	}
 
-	if (!current.empty())
-		result.push_back(InstructionTextToken(StringToken, current));
+	if (curStart < curEnd)
+		ConstructToken(curStart, curEnd);
 
 	return result;
 }
 
 static vector<Item> CreateStringGroups(const vector<Item>& items)
@@ -846,7 +795,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
 				break;
 			case StringToken:
 			{
-				vector<InstructionTextToken> stringTokens = SeparateStringTokens(token);
+				vector<InstructionTextToken> stringTokens = ParseStringToken(token, 512);
 				for (size_t k = 0; k < stringTokens.size(); k++)
 				{
 					InstructionTextToken subToken = stringTokens[k];