-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Fix: Improve HTML entity unescaping for Gemini and other models #7891
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -73,16 +73,20 @@ export async function writeToFileTool( | |
| cline.diffViewProvider.editType = fileExists ? "modify" : "create" | ||
| } | ||
|
|
||
| // pre-processing newContent for cases where weaker models might add artifacts like markdown codeblock markers (deepseek/llama) or extra escape characters (gemini) | ||
| // Pre-processing newContent for cases where models might add artifacts | ||
| // Some models (DeepSeek/Llama) add markdown codeblock markers | ||
| // Others (Gemini) return content with HTML-escaped characters | ||
| if (newContent.startsWith("```")) { | ||
| // cline handles cases where it includes language specifiers like ```python ```js | ||
| // Handle cases where it includes language specifiers like ```python ```js | ||
| newContent = newContent.split("\n").slice(1).join("\n") | ||
| } | ||
|
|
||
| if (newContent.endsWith("```")) { | ||
| newContent = newContent.split("\n").slice(0, -1).join("\n") | ||
| } | ||
|
|
||
| // Unescape HTML entities for non-Claude models (e.g., Gemini, DeepSeek, Llama) | ||
| // These models may return content with escaped characters that need to be unescaped | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment consistency suggestion here - could be more generic rather than listing specific model names. |
||
| if (!cline.api.getModel().id.includes("claude")) { | ||
| newContent = unescapeHtmlEntities(newContent) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| import { describe, it, expect } from "vitest" | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we consider merging this test file with the existing |
||
| import { unescapeHtmlEntities } from "../text-normalization" | ||
|
|
||
| describe("Extended HTML entity unescaping", () => { | ||
| describe("unescapeHtmlEntities", () => { | ||
| it("unescapes alternative apostrophe encoding", () => { | ||
| const input = "It's working" | ||
| const expected = "It's working" | ||
| expect(unescapeHtmlEntities(input)).toBe(expected) | ||
| }) | ||
|
|
||
| it("unescapes forward slash", () => { | ||
| const input = "path/to/file" | ||
| const expected = "path/to/file" | ||
| expect(unescapeHtmlEntities(input)).toBe(expected) | ||
| }) | ||
|
|
||
| it("unescapes backslash", () => { | ||
| const input = "C:\Users\file" | ||
| const expected = "C:\\Users\\file" | ||
| expect(unescapeHtmlEntities(input)).toBe(expected) | ||
| }) | ||
|
|
||
| it("unescapes backtick", () => { | ||
| const input = "`code`" | ||
| const expected = "`code`" | ||
| expect(unescapeHtmlEntities(input)).toBe(expected) | ||
| }) | ||
|
|
||
| it("unescapes non-breaking space", () => { | ||
| const input = "Hello World" | ||
| const expected = "Hello World" | ||
| expect(unescapeHtmlEntities(input)).toBe(expected) | ||
| }) | ||
|
|
||
| it("handles complex mixed content with all entity types", () => { | ||
| const input = | ||
| "<div class="test">It's a test/path\file with `code` & more</div>" | ||
| const expected = '<div class="test">It\'s a test/path\\file with `code` & more</div>' | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this double space intentional? The input has |
||
| expect(unescapeHtmlEntities(input)).toBe(expected) | ||
| }) | ||
|
|
||
| it("handles Gemini-style escaped markdown content", () => { | ||
| const input = | ||
| "```python\n<search>\ndef old_function():\n return 'old'\n</search>\n<replace>\ndef new_function():\n return 'new'\n</replace>\n```" | ||
| const expected = | ||
| "```python\n<search>\ndef old_function():\n return 'old'\n</search>\n<replace>\ndef new_function():\n return 'new'\n</replace>\n```" | ||
| expect(unescapeHtmlEntities(input)).toBe(expected) | ||
| }) | ||
|
|
||
| it("correctly orders ampersand unescaping to avoid double-unescaping", () => { | ||
| const input = "&lt;&gt;&amp;" | ||
| const expected = "<>&" | ||
| expect(unescapeHtmlEntities(input)).toBe(expected) | ||
| }) | ||
| }) | ||
| }) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -91,5 +91,10 @@ export function unescapeHtmlEntities(text: string): string { | |
| .replace(/"/g, '"') | ||
| .replace(/'/g, "'") | ||
| .replace(/'/g, "'") | ||
| .replace(/&/g, "&") | ||
| .replace(/'/g, "'") // Alternative apostrophe encoding | ||
| .replace(///g, "/") // Forward slash | ||
| .replace(/\/g, "\\") // Backslash | ||
| .replace(/`/g, "`") // Backtick | ||
| .replace(/ /g, " ") // Non-breaking space | ||
| .replace(/&/g, "&") // Must be last to avoid double-unescaping | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For high-frequency usage, would it be worth considering a single regex with a replacement function instead of chaining multiple |
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor suggestion: Could we make this comment more generic? Instead of listing specific models, perhaps: "Unescape HTML entities for non-Claude models that may return content with escaped characters"