Skip to content

Commit ba12b64

Browse files
committed
Handle non-UTF8 encodings better when preparing unified diffs for the frontend (#10475)
1 parent 0d0da52 commit ba12b64

File tree

6 files changed

+79
-4
lines changed

6 files changed

+79
-4
lines changed

Cargo.lock

Lines changed: 14 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/but-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ gitbutler-serde.workspace = true
2828
gitbutler-error.workspace = true
2929
uuid.workspace = true
3030
toml.workspace = true
31+
chardetng = "0.1.17"
3132

3233
[dev-dependencies]
3334
but-testsupport.workspace = true

crates/but-core/src/unified_diff.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ pub struct DiffHunk {
3030
///
3131
/// The line separator is the one used in the original file and may be `LF` or `CRLF`.
3232
/// Note that the file-portion of the header isn't used here.
33+
///
34+
/// Also note that this has possibly been decoded lossily, assuming UTF8 if the encoding couldn't be determined,
35+
/// replacing invalid codepoints with markers.
3336
#[serde(serialize_with = "gitbutler_serde::bstring_lossy::serialize")]
3437
pub diff: BString,
3538
}
@@ -181,7 +184,7 @@ impl UnifiedDiff {
181184
let mut buf = Vec::with_capacity(header_str.len() + hunk.len());
182185
buf.extend_from_slice(header_str.as_bytes());
183186
buf.extend_from_slice(hunk);
184-
buf.into()
187+
detect_and_convert_to_utf8(buf.into())
185188
},
186189
});
187190
Ok(())
@@ -234,6 +237,22 @@ impl UnifiedDiff {
234237
}
235238
}
236239

240+
/// Detect the encoding of the given byte content and convert it to UTF-8, after attempting to guess its encoding.
241+
/// Even if decoding failed, we always return the original `content` in the wirst case.
242+
fn detect_and_convert_to_utf8(content: BString) -> BString {
243+
// Use chardet to detect the encoding
244+
let mut detect = chardetng::EncodingDetector::new();
245+
detect.feed(&content, true);
246+
let (encoding, high_confidence) = detect.guess_assess(None, true);
247+
if encoding.name() == "UTF-8" || !high_confidence {
248+
return content;
249+
}
250+
251+
// Convert to UTF-8
252+
let (decoded, _actual_encoding, _used_replacement_chars) = encoding.decode(&content);
253+
decoded.into_owned().into()
254+
}
255+
237256
fn compute_line_changes(hunks: &Vec<DiffHunk>) -> (u32, u32) {
238257
let mut lines_added = 0;
239258
let mut lines_removed = 0;

crates/but-core/tests/core/diff/worktree_changes.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -967,6 +967,45 @@ fn added_modified_in_worktree() -> Result<()> {
967967
Ok(())
968968
}
969969

970+
#[test]
971+
fn non_utf8_decoding() -> Result<()> {
972+
let repo = repo("non-utf8-encodings")?;
973+
let actual = diff::worktree_changes(&repo)?;
974+
// Let's have one sample file per codepage with reasonable amounts of text for inference.
975+
insta::assert_debug_snapshot!(actual, @r#"
976+
WorktreeChanges {
977+
changes: [
978+
TreeChange {
979+
path: "windows1252",
980+
status: Addition {
981+
state: ChangeState {
982+
id: Sha1(0000000000000000000000000000000000000000),
983+
kind: Blob,
984+
},
985+
is_untracked: true,
986+
},
987+
},
988+
],
989+
ignored_changes: [],
990+
}
991+
"#);
992+
insta::assert_debug_snapshot!(unified_diffs(actual, &repo)?, @r#"
993+
[
994+
Patch {
995+
hunks: [
996+
DiffHunk("@@ -1,0 +1,1 @@
997+
+€ÄÀ
998+
"),
999+
],
1000+
is_result_of_binary_to_text_conversion: false,
1001+
lines_added: 1,
1002+
lines_removed: 0,
1003+
},
1004+
]
1005+
"#);
1006+
Ok(())
1007+
}
1008+
9701009
#[test]
9711010
fn modified_in_index() -> Result<()> {
9721011
let repo = repo("modified-in-index")?;

crates/but-core/tests/fixtures/worktree-changes.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,3 +310,8 @@ cat <<EOF >>.git/config
310310
textconv = "shift; echo ho"
311311
EOF
312312
)
313+
314+
git init non-utf8-encodings
315+
(cd non-utf8-encodings
316+
printf '\x80\xc4\xc0' > windows1252
317+
)

crates/but-cursor/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,6 @@ pub async fn handle_stop(nightly: bool) -> anyhow::Result<CursorHookOutput> {
281281
}
282282
}
283283

284-
// dbg!(input);
285284
Ok(CursorHookOutput::default())
286285
}
287286

0 commit comments

Comments
 (0)