Fix/word internal underscores (#125)

johnmccambridge7 · web-flow · commit 266fa2be95a8 · 2025-09-10T21:09:58.000-07:00
* handle case for internal underscore

* fix: word-internal underscores incorrectly treated as incomplete markdown

* chore: remove redundant coverage

* fix: word internal underscores

* fix: handle non-ASCII (Unicode) word characters
diff --git a/.changeset/fix-word-internal-underscores.md b/.changeset/fix-word-internal-underscores.md
@@ -0,0 +1,13 @@
+---
+"streamdown": patch
+---
+
+Fix word-internal underscores being incorrectly treated as incomplete markdown
+
+Previously, underscores used as word separators (e.g., `hello_world`, `snake_case`) were incorrectly identified as incomplete italic markdown, causing an extra underscore to be appended. This fix:
+
+- Detects when underscores are between word characters and treats them as literals
+- Preserves the streaming markdown completion for genuine incomplete italics (e.g., `_italic text`)
+- Correctly handles trailing newlines when completing italic formatting
+
+Fixes the issue where `hello_world` would become `hello_world_` when `parseIncompleteMarkdown` was enabled.
diff --git a/packages/streamdown/__tests__/parse-incomplete-markdown.test.ts b/packages/streamdown/__tests__/parse-incomplete-markdown.test.ts
@@ -190,6 +190,29 @@ describe("parseIncompleteMarkdown", () => {
         "some\\_text_with_underscores"
       );
     });
+
+    it("should handle mixed escaped and unescaped underscores correctly", () => {
+      expect(parseIncompleteMarkdown("\\_escaped\\_ and _unescaped")).toBe(
+        "\\_escaped\\_ and _unescaped_"
+      );
+
+      expect(parseIncompleteMarkdown("Start \\_escaped\\_ middle _incomplete")).toBe(
+        "Start \\_escaped\\_ middle _incomplete_"
+      );
+
+      expect(parseIncompleteMarkdown("\\_fully\\_escaped\\_")).toBe(
+        "\\_fully\\_escaped\\_"
+      );
+
+      expect(parseIncompleteMarkdown("\\_escaped\\_ _complete_ pair")).toBe(
+        "\\_escaped\\_ _complete_ pair"
+      );
+    });
+
+    it("should handle underscores with unicode word characters", () => {
+      expect(parseIncompleteMarkdown("café_price")).toBe("café_price");
+      expect(parseIncompleteMarkdown("naïve_approach")).toBe("naïve_approach");
+    });
   });
 
   describe("inline code formatting (`)", () => {
diff --git a/packages/streamdown/__tests__/underscore-bug.test.tsx b/packages/streamdown/__tests__/underscore-bug.test.tsx
@@ -0,0 +1,130 @@
+import { describe, expect, it } from "vitest";
+import { parseIncompleteMarkdown } from "../lib/parse-incomplete-markdown";
+
+describe("parseIncompleteMarkdown - word-internal underscores", () => {
+  describe("underscores as word separators", () => {
+    it("should handle single underscore between words", () => {
+      const input = "hello_world";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("hello_world");
+    });
+
+    it("should handle multiple underscores between words", () => {
+      const input = "hello_world_test";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("hello_world_test");
+    });
+
+    it("should handle CONSTANT_CASE", () => {
+      const input = "MAX_VALUE";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("MAX_VALUE");
+    });
+
+    it("should handle multiple snake_case words in text", () => {
+      const input = "The user_name and user_email are required";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("The user_name and user_email are required");
+    });
+
+    it("should handle underscore in URLs", () => {
+      const input = "Visit https://example.com/path_with_underscore";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("Visit https://example.com/path_with_underscore");
+    });
+
+    it("should handle numbers with underscores", () => {
+      const input = "The value is 1_000_000";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("The value is 1_000_000");
+    });
+  });
+
+  describe("incomplete italic formatting", () => {
+    it("should complete italic at word boundary", () => {
+      const input = "_italic text";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("_italic text_");
+    });
+
+    it("should complete italic with punctuation", () => {
+      const input = "This is _italic";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("This is _italic_");
+    });
+
+    it("should complete italic before newline", () => {
+      const input = "_italic\n";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("_italic_\n");
+    });
+  });
+
+  describe("edge cases", () => {
+    it("should handle underscore at end of word (ambiguous case)", () => {
+      const input = "word_";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("word_");
+    });
+
+    it("should handle leading underscore in identifier", () => {
+      const input = "_privateVariable";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("_privateVariable_");
+    });
+
+    it("should handle code with underscores in markdown", () => {
+      const input = "Use `variable_name` in your code";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("Use `variable_name` in your code");
+    });
+
+    it("should handle mixed snake_case and italic", () => {
+      const input = "The variable_name is _important";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("The variable_name is _important_");
+    });
+
+    it("should not modify complete italic pairs", () => {
+      const input = "_complete italic_ and some_other_text";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("_complete italic_ and some_other_text");
+    });
+
+    it("should handle underscore in code blocks", () => {
+      const input = "```\nfunction_name()\n```";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("```\nfunction_name()\n```");
+    });
+
+    it("should handle HTML attributes with underscores", () => {
+      const input = '<div data_attribute="value">';
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe('<div data_attribute="value">');
+    });
+  });
+
+  describe("real-world scenarios", () => {
+    it("should handle Python-style names", () => {
+      const input = "__init__ and __main__ are special";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("__init__ and __main__ are special");
+    });
+
+    it("should handle markdown in sentences with snake_case", () => {
+      const input = "The user_id field stores the _unique identifier";
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe("The user_id field stores the _unique identifier_");
+    });
+
+    it("should handle the original bug report case", () => {
+      const input = `hello_world
+
+<a href="example_link"/>`;
+      const result = parseIncompleteMarkdown(input);
+      expect(result).toBe(input);
+      expect(result).not.toMatch(/hello_world_/);
+      expect(result).not.toMatch(/_$/);
+    });
+  });
+});
diff --git a/packages/streamdown/lib/parse-incomplete-markdown.ts b/packages/streamdown/lib/parse-incomplete-markdown.ts
@@ -254,6 +254,10 @@ const countSingleUnderscores = (text: string): number => {
       if (isWithinMathBlock(text, index)) {
         return acc;
       }
+      // Skip if underscore is word-internal (between word characters)
+      if (prevChar && nextChar && /[\p{L}\p{N}_]/u.test(prevChar) && /[\p{L}\p{N}_]/u.test(nextChar)) {
+        return acc;
+      }
       if (prevChar !== "_" && nextChar !== "_") {
         return acc + 1;
       }
@@ -272,15 +276,28 @@ const handleIncompleteSingleUnderscoreItalic = (text: string): string => {
   const singleUnderscoreMatch = text.match(singleUnderscorePattern);
 
   if (singleUnderscoreMatch) {
-    // Find the first single underscore position (not part of __)
+    // Find the first single underscore position (not part of __, not backslash-escaped, and not word-internal)
     let firstSingleUnderscoreIndex = -1;
     for (let i = 0; i < text.length; i++) {
       if (
         text[i] === "_" &&
         text[i - 1] !== "_" &&
         text[i + 1] !== "_" &&
+        text[i - 1] !== "\\" &&
         !isWithinMathBlock(text, i)
       ) {
+        // Check if underscore is word-internal (between word characters)
+        const prevChar = i > 0 ? text[i - 1] : "";
+        const nextChar = i < text.length - 1 ? text[i + 1] : "";
+        if (
+          prevChar &&
+          nextChar &&
+          /[\p{L}\p{N}_]/u.test(prevChar) &&
+          /[\p{L}\p{N}_]/u.test(nextChar)
+        ) {
+          continue;
+        }
+
         firstSingleUnderscoreIndex = i;
         break;
       }
@@ -306,6 +323,15 @@ const handleIncompleteSingleUnderscoreItalic = (text: string): string => {
 
     const singleUnderscores = countSingleUnderscores(text);
     if (singleUnderscores % 2 === 1) {
+      // If text ends with newline(s), insert underscore before them
+      const trailingNewlineMatch = text.match(/\n+$/);
+      if (trailingNewlineMatch) {
+        const textBeforeNewlines = text.slice(
+          0,
+          -trailingNewlineMatch[0].length
+        );
+        return `${textBeforeNewlines}_${trailingNewlineMatch[0]}`;
+      }
       return `${text}_`;
     }
   }