Skip to content

Commit c928f69

Browse files
committed
feat(file-decode): support files in UTF-16/UTF-32 by detecting BOM
1 parent 1f8c34d commit c928f69

File tree

8 files changed

+23
-6
lines changed

8 files changed

+23
-6
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,4 @@ azure_storage_blobs = { version = "0.21.0", default-features = false, features =
152152
] }
153153
serde_path_to_error = "0.1.17"
154154
expect-test = "1.5.0"
155+
encoding_rs = "0.8.35"

src/ops/sources/amazon_s3.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ impl SourceExecutor for Executor {
152152
Some(SourceValue::Existence(if self.binary {
153153
fields_value!(bytes.to_vec())
154154
} else {
155-
fields_value!(String::from_utf8_lossy(&bytes).to_string())
155+
let (s, _) = utils::bytes_decode::bytes_to_string(&bytes);
156+
fields_value!(s)
156157
}))
157158
} else {
158159
None

src/ops/sources/azure_blob.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,8 @@ impl SourceExecutor for Executor {
147147
Some(SourceValue::Existence(if self.binary {
148148
fields_value!(bytes)
149149
} else {
150-
fields_value!(String::from_utf8_lossy(&bytes).to_string())
150+
let (s, _) = utils::bytes_decode::bytes_to_string(&bytes);
151+
fields_value!(s)
151152
}))
152153
} else {
153154
None

src/ops/sources/google_drive.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -400,9 +400,8 @@ impl SourceExecutor for Executor {
400400
if self.binary {
401401
content.to_bytes().to_vec().into()
402402
} else {
403-
String::from_utf8_lossy(&content.to_bytes())
404-
.to_string()
405-
.into()
403+
let (s, _) = utils::bytes_decode::bytes_to_string(&content.to_bytes());
404+
s.into()
406405
},
407406
];
408407
Some(SourceValue::Existence(FieldValues { fields }))

src/ops/sources/local_file.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ impl SourceExecutor for Executor {
9797
let content = if self.binary {
9898
fields_value!(content)
9999
} else {
100-
fields_value!(String::from_utf8_lossy(&content).to_string())
100+
let (s, _) = utils::bytes_decode::bytes_to_string(&content);
101+
fields_value!(s)
101102
};
102103
Some(SourceValue::Existence(content))
103104
}

src/utils/bytes_decode.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
use encoding_rs::Encoding;
2+
3+
/// Decodes raw bytes into an owned `String`, picking the encoding from a leading BOM when
/// one is recognized and falling back to UTF-8 otherwise.
///
/// Returns the decoded text together with the decoder's `had_errors` flag, so callers can
/// tell whether the input was decoded cleanly.
pub fn bytes_to_string(bytes: &[u8]) -> (String, bool) {
4+
// 1) BOM sniff first (definitive for UTF-8/16; UTF-32 is not supported here).
//    The BOM itself is skipped via `bom_len` before decoding the remaining bytes.
5+
if let Some((enc, bom_len)) = Encoding::for_bom(bytes) {
6+
let (cow, had_errors) = enc.decode_without_bom_handling(&bytes[bom_len..]);
7+
return (cow.into_owned(), had_errors);
8+
}
9+
// 2) No BOM recognized: decode as UTF-8. A UTF-8 BOM would already have been handled
//    by the branch above, so the BOM removal here is only a safeguard.
10+
let (cow, had_errors) = encoding_rs::UTF_8.decode_with_bom_removal(bytes);
11+
(cow.into_owned(), had_errors)
12+
}

src/utils/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
pub mod bytes_decode;
12
pub mod concur_control;
23
pub mod db;
34
pub mod deser;

0 commit comments

Comments
 (0)