Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ datafusion-sql = { path = "datafusion/sql", version = "52.1.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "52.1.0" }

doc-comment = "0.3"
encoding_rs = "0.8"
env_logger = "0.11"
flate2 = "1.1.9"
futures = "0.3"
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ Optional features:

- `avro`: support for reading the [Apache Avro] format
- `backtrace`: include backtrace information in error messages
- `encoding_rs`: support for reading files with character encodings other than UTF-8
- `parquet_encryption`: support for using [Parquet Modular Encryption]
- `serde`: enable arrow-schema's `serde` feature

Expand Down
8 changes: 8 additions & 0 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2921,6 +2921,7 @@ config_namespace! {
///
/// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting.
pub newlines_in_values: Option<bool>, default = None
pub charset: Option<String>, default = None
pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
/// Compression level for the output file. The valid range depends on the
/// compression algorithm:
Expand Down Expand Up @@ -3033,6 +3034,13 @@ impl CsvOptions {
self
}

/// Specifies the character encoding the file is encoded with.
/// - defaults to UTF-8
pub fn with_charset(mut self, charset: impl Into<String>) -> Self {
self.charset = Some(charset.into());
self
}

/// Set a `CompressionTypeVariant` of CSV
/// - defaults to `CompressionTypeVariant::UNCOMPRESSED`
pub fn with_file_compression_type(
Expand Down
2 changes: 2 additions & 0 deletions datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ default = [
"recursive_protection",
"sql",
]
encoding_rs = ["datafusion-datasource-csv/encoding_rs"]
encoding_expressions = ["datafusion-functions/encoding_expressions"]
# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"]
Expand Down Expand Up @@ -171,6 +172,7 @@ datafusion-functions-window-common = { workspace = true }
datafusion-macros = { workspace = true }
datafusion-physical-optimizer = { workspace = true }
doc-comment = { workspace = true }
encoding_rs = { workspace = true }
env_logger = { workspace = true }
glob = { workspace = true }
insta = { workspace = true }
Expand Down
45 changes: 45 additions & 0 deletions datafusion/core/src/datasource/file_format/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1619,4 +1619,49 @@ mod tests {

Ok(())
}

#[cfg(feature = "encoding_rs")]
#[tokio::test]
async fn test_read_shift_jis_csv() -> Result<()> {
use std::io::Write;

// Encode a test CSV into SHIFT-JIS
let data = r#"ID,Name,Price,Description,Notes
001,山本 大輔,\2945,桜餅と抹茶のセット,数量限定
002,加藤 由美,\9575,和牛ステーキセット,取り寄せ中
003,田中 太郎,\1853,抹茶アイスクリーム,ポイント2倍
004,渡辺 さくら,\9494,和牛ステーキセット,送料無料
005,加藤 由美,\558,和牛ステーキセット,新商品
006,渡辺 さくら,\7704,天ぷら盛り合わせ,割引対象外
007,田中 太郎,\212,桜餅と抹茶のセット,取り寄せ中
008,中村 陽子,\8847,和牛ステーキセット,期間限定
009,伊藤 健太,\5997,季節の野菜カレー,お一人様1点限り
010,高橋 美咲,\6594,季節の野菜カレー,冷凍保存"#;
let (data, _, _) = encoding_rs::SHIFT_JIS.encode(data);

// Write the CSV data to a temp file
let mut tmp = tempfile::Builder::new().suffix(".csv").tempfile()?;
tmp.write_all(&*data)?;
let path = tmp.path().to_str().unwrap().to_string();

// Read the file
let ctx = SessionContext::new();
let opts = CsvReadOptions::new().has_header(true).charset("SHIFT-JIS");
let batches = ctx.read_csv(path, opts).await?.collect().await?;

// Check
let num_rows = batches.iter().map(|b| b.num_rows()).sum::<usize>();
assert_eq!(num_rows, 10);

let names = batches[0]
.column(1)
.as_any()
.downcast_ref::<StringArray>()
.unwrap();
assert_eq!(names.value(0), "山本 大輔");
assert_eq!(names.value(1), "加藤 由美");
assert_eq!(names.value(2), "田中 太郎");

Ok(())
}
}
10 changes: 10 additions & 0 deletions datafusion/core/src/datasource/file_format/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ pub struct CsvReadOptions<'a> {
pub file_extension: &'a str,
/// Partition Columns
pub table_partition_cols: Vec<(String, DataType)>,
/// Character encoding
pub charset: Option<&'a str>,
/// File compression type
pub file_compression_type: FileCompressionType,
/// Indicates how the file is sorted
Expand Down Expand Up @@ -118,6 +120,7 @@ impl<'a> CsvReadOptions<'a> {
newlines_in_values: false,
file_extension: DEFAULT_CSV_EXTENSION,
table_partition_cols: vec![],
charset: None,
file_compression_type: FileCompressionType::UNCOMPRESSED,
file_sort_order: vec![],
comment: None,
Expand Down Expand Up @@ -209,6 +212,12 @@ impl<'a> CsvReadOptions<'a> {
self
}

/// Configure the character set encoding
pub fn charset(mut self, charset: &'a str) -> Self {
self.charset = Some(charset);
self
}

/// Configure file compression type
pub fn file_compression_type(
mut self,
Expand Down Expand Up @@ -633,6 +642,7 @@ impl ReadOptions<'_> for CsvReadOptions<'_> {
.with_terminator(self.terminator)
.with_newlines_in_values(self.newlines_in_values)
.with_schema_infer_max_rec(self.schema_infer_max_records)
.with_charset(self.charset.map(ToOwned::to_owned))
.with_file_compression_type(self.file_compression_type.to_owned())
.with_null_regex(self.null_regex.clone())
.with_truncated_rows(self.truncated_rows);
Expand Down
1 change: 1 addition & 0 deletions datafusion/datasource-csv/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ datafusion-expr = { workspace = true }
datafusion-physical-expr-common = { workspace = true }
datafusion-physical-plan = { workspace = true }
datafusion-session = { workspace = true }
encoding_rs = { workspace = true, optional = true }
futures = { workspace = true }
object_store = { workspace = true }
regex = { workspace = true }
Expand Down
Loading