Skip to content

Commit b4b4221

Browse files
authored
fix: Sanitize hidden Unicode characters from user and tool inputs (#2435)
* add a sanitize_unicode_tag method * add ut * clean execute_bash result * fmt * exclude fffd when filtering * import sanitize func * fix typo
1 parent 0b64802 commit b4b4221

File tree

5 files changed

+92
-5
lines changed

5 files changed

+92
-5
lines changed

crates/chat-cli/src/cli/chat/cli/usage.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ impl UsageArgs {
106106
)),
107107
// Context files
108108
style::SetForegroundColor(Color::DarkCyan),
109-
// add a nice visual to mimic "tiny" progress, so the overral progress bar doesn't look too
109+
// add a nice visual to mimic "tiny" progress, so the overall progress bar doesn't look too
110110
// empty
111111
style::Print("|".repeat(if context_width == 0 && *context_token_count > 0 {
112112
1

crates/chat-cli/src/cli/chat/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ use crate::cli::chat::cli::prompts::{
138138
GetPromptError,
139139
PromptsSubcommand,
140140
};
141+
use crate::cli::chat::util::sanitize_unicode_tags;
141142
use crate::database::settings::Setting;
142143
use crate::mcp_client::Prompt;
143144
use crate::os::Os;
@@ -1585,7 +1586,7 @@ impl ChatSession {
15851586

15861587
async fn handle_input(&mut self, os: &mut Os, mut user_input: String) -> Result<ChatState, ChatError> {
15871588
queue!(self.stderr, style::Print('\n'))?;
1588-
1589+
user_input = sanitize_unicode_tags(&user_input);
15891590
let input = user_input.trim();
15901591

15911592
// handle image path

crates/chat-cli/src/cli/chat/tools/execute/mod.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use crate::cli::agent::{
1313
Agent,
1414
PermissionEvalResult,
1515
};
16+
use crate::cli::chat::sanitize_unicode_tags;
1617
use crate::cli::chat::tools::{
1718
InvokeOutput,
1819
MAX_TOOL_RESPONSE_SIZE,
@@ -120,10 +121,13 @@ impl ExecuteCommand {
120121

121122
pub async fn invoke(&self, output: &mut impl Write) -> Result<InvokeOutput> {
122123
let output = run_command(&self.command, MAX_TOOL_RESPONSE_SIZE / 3, Some(output)).await?;
124+
let clean_stdout = sanitize_unicode_tags(&output.stdout);
125+
let clean_stderr = sanitize_unicode_tags(&output.stderr);
126+
123127
let result = serde_json::json!({
124128
"exit_status": output.exit_status.unwrap_or(0).to_string(),
125-
"stdout": output.stdout,
126-
"stderr": output.stderr,
129+
"stdout": clean_stdout,
130+
"stderr": clean_stderr,
127131
});
128132

129133
Ok(InvokeOutput {

crates/chat-cli/src/cli/chat/tools/fs_read.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,16 @@ use crate::cli::agent::{
3737
Agent,
3838
PermissionEvalResult,
3939
};
40-
use crate::cli::chat::CONTINUATION_LINE;
4140
use crate::cli::chat::tools::display_purpose;
4241
use crate::cli::chat::util::images::{
4342
handle_images_from_paths,
4443
is_supported_image_type,
4544
pre_process,
4645
};
46+
use crate::cli::chat::{
47+
CONTINUATION_LINE,
48+
sanitize_unicode_tags,
49+
};
4750
use crate::os::Os;
4851

4952
#[derive(Debug, Clone, Deserialize)]
@@ -451,6 +454,7 @@ impl FsLine {
451454
debug!(?path, "Reading");
452455
let file_bytes = os.fs.read(&path).await?;
453456
let file_content = String::from_utf8_lossy(&file_bytes);
457+
let file_content = sanitize_unicode_tags(&file_content);
454458
let line_count = file_content.lines().count();
455459
let (start, end) = (
456460
convert_negative_index(line_count, self.start_line()),
@@ -559,6 +563,7 @@ impl FsSearch {
559563

560564
let file_bytes = os.fs.read(&file_path).await?;
561565
let file_content = String::from_utf8_lossy(&file_bytes);
566+
let file_content = sanitize_unicode_tags(&file_content);
562567
let lines: Vec<&str> = LinesWithEndings::from(&file_content).collect();
563568

564569
let mut results = Vec::new();

crates/chat-cli/src/cli/chat/util/mod.rs

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,48 @@ pub fn animate_output(output: &mut impl Write, bytes: &[u8]) -> Result<(), ChatE
5959
Ok(())
6060
}
6161

62+
/// Reports whether `c` falls in one of the invisible / formatting code point
/// ranges that are stripped from text before it is used as LLM input.
///
/// U+FFFD (the replacement character) is deliberately excluded: it sits between
/// the two Specials ranges below and is kept so that invalid bytes stay visible.
fn is_hidden(c: char) -> bool {
    matches!(
        c,
        // Tags block (can be used to smuggle hidden prompts)
        '\u{E0000}'..='\u{E007F}'
            // zero-width space / non-joiner / joiner, LTR and RTL marks
            | '\u{200B}'..='\u{200F}'
            // line and paragraph separators, bidi embedding / override controls,
            // narrow no-break space
            | '\u{2028}'..='\u{202F}'
            // medium mathematical space, word joiner, invisible operators,
            // bidi isolates and other format controls
            | '\u{205F}'..='\u{206F}'
            // Specials block up to (and including) the object replacement character
            | '\u{FFF0}'..='\u{FFFC}'
            // Specials block non-characters
            | '\u{FFFE}'..='\u{FFFF}'
    )
}
78+
79+
/// Remove hidden / control characters from `text`.
80+
///
81+
/// * `text` – raw user input or file content
82+
///
83+
/// The function keeps things **O(n)** with a single allocation and logs how many
84+
/// characters were dropped. 400 KB worst-case size ⇒ sub-millisecond runtime.
85+
pub fn sanitize_unicode_tags(text: &str) -> String {
86+
let mut removed = 0;
87+
let out: String = text
88+
.chars()
89+
.filter(|&c| {
90+
let bad = is_hidden(c);
91+
if bad {
92+
removed += 1;
93+
}
94+
!bad
95+
})
96+
.collect();
97+
98+
if removed > 0 {
99+
tracing::debug!("Detected and removed {} hidden chars", removed);
100+
}
101+
out
102+
}
103+
62104
/// Play the terminal bell notification sound
63105
pub fn play_notification_bell(requires_confirmation: bool) {
64106
// Don't play bell for tools that don't require confirmation
@@ -249,4 +291,39 @@ mod tests {
249291
}
250292
assert_eq!(files.len(), 1);
251293
}
294+
/// Checks both bounds of every range matched by `is_hidden`, and that ordinary
/// text plus the replacement character U+FFFD are not classified as hidden.
#[test]
fn is_hidden_recognises_all_ranges() {
    // Lower and upper bound of each range, including the U+FFFE..=U+FFFF
    // non-characters.
    let samples = [
        '\u{E0000}', '\u{E007F}',
        '\u{200B}', '\u{200F}',
        '\u{2028}', '\u{202F}',
        '\u{205F}', '\u{206F}',
        '\u{FFF0}', '\u{FFFC}',
        '\u{FFFE}', '\u{FFFF}',
    ];

    for ch in samples {
        assert!(is_hidden(ch), "char U+{:X} should be hidden", ch as u32);
    }

    // U+FFFD sits between the two Specials ranges and must be preserved.
    for ch in ['a', '你', '\u{03A9}', '\u{FFFD}'] {
        assert!(!is_hidden(ch), "char {:?} should NOT be hidden", ch);
    }
}
306+
307+
/// Text made up only of visible characters (ASCII, emoji) must come back unchanged.
#[test]
fn sanitize_keeps_visible_text_intact() {
    let input = "Rust 🦀 > C";
    let cleaned = sanitize_unicode_tags(input);
    assert_eq!(cleaned, input);
}
312+
313+
/// Feeds a large input alternating visible and hidden runs, then checks that
/// only the visible bytes remain and no hidden character is left in the output.
#[test]
fn sanitize_handles_large_mixture() {
    const REPEATS: usize = 50_000;
    let visible_block = "abcXYZ";
    let hidden_block = "\u{200B}\u{E0000}";

    // One visible run followed by one hidden run, repeated REPEATS times.
    let big_input = [visible_block, hidden_block].concat().repeat(REPEATS);

    let result = sanitize_unicode_tags(&big_input);

    assert_eq!(result.len(), REPEATS * visible_block.len());
    assert!(!result.chars().any(is_hidden));
}
252329
}

0 commit comments

Comments
 (0)