Skip to content

Commit 0bd8809

Browse files
authored
fix: csv schema_infer_max_records set to 0 returns null datatype (#19432)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #19417 ## Rationale for this change - See #19417 - Related to #17796 ## What changes are included in this PR? When `schema_infer_max_records` is set to 0 for CSV, the inferred data type is now returned as string (`Utf8`) instead of `Null`. ## Are these changes tested? Yes — a test case was added covering `schema_infer_max_records` equal to 0. ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 33ac70d commit 0bd8809

File tree

3 files changed

+88
-28
lines changed

3 files changed

+88
-28
lines changed

datafusion/core/src/datasource/file_format/csv.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,4 +1536,32 @@ mod tests {
15361536

15371537
Ok(())
15381538
}
1539+
1540+
#[tokio::test]
1541+
async fn test_infer_schema_with_zero_max_records() -> Result<()> {
1542+
let session_ctx = SessionContext::new();
1543+
let state = session_ctx.state();
1544+
1545+
let root = format!("{}/csv", arrow_test_data());
1546+
let format = CsvFormat::default()
1547+
.with_has_header(true)
1548+
.with_schema_infer_max_rec(0); // Set to 0 to disable inference
1549+
let exec = scan_format(
1550+
&state,
1551+
&format,
1552+
None,
1553+
&root,
1554+
"aggregate_test_100.csv",
1555+
None,
1556+
None,
1557+
)
1558+
.await?;
1559+
1560+
// related to https://github.com/apache/datafusion/issues/19417
1561+
for f in exec.schema().fields() {
1562+
assert_eq!(*f.data_type(), DataType::Utf8);
1563+
}
1564+
1565+
Ok(())
1566+
}
15391567
}

datafusion/datasource-csv/src/file_format.rs

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,11 @@ impl CsvFormat {
211211

212212
/// Set a limit in terms of records to scan to infer the schema
213213
/// - default to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
214+
///
215+
/// # Behavior when set to 0
216+
///
217+
/// When `max_rec` is set to 0, schema inference is disabled and all fields
218+
/// will be inferred as `Utf8` (string) type, regardless of their actual content.
214219
pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self {
215220
self.options.schema_infer_max_rec = Some(max_rec);
216221
self
@@ -529,6 +534,7 @@ impl CsvFormat {
529534
let mut column_names = vec![];
530535
let mut column_type_possibilities = vec![];
531536
let mut record_number = -1;
537+
let initial_records_to_read = records_to_read;
532538

533539
pin_mut!(stream);
534540

@@ -619,12 +625,31 @@ impl CsvFormat {
619625
}
620626
}
621627

622-
let schema = build_schema_helper(column_names, column_type_possibilities);
628+
let schema = build_schema_helper(
629+
column_names,
630+
column_type_possibilities,
631+
initial_records_to_read == 0,
632+
);
623633
Ok((schema, total_records_read))
624634
}
625635
}
626636

627-
fn build_schema_helper(names: Vec<String>, types: Vec<HashSet<DataType>>) -> Schema {
637+
/// Builds a schema from column names and their possible data types.
638+
///
639+
/// # Arguments
640+
///
641+
/// * `names` - Vector of column names
642+
/// * `types` - Vector of possible data types for each column (as HashSets)
643+
/// * `disable_inference` - When true, forces all columns with no inferred types to be Utf8.
644+
/// This should be set to true when `schema_infer_max_rec` is explicitly
645+
/// set to 0, indicating the user wants to skip type inference and treat
646+
/// all fields as strings. When false, columns with no inferred types
647+
/// will be set to Null, allowing schema merging to work properly.
648+
fn build_schema_helper(
649+
names: Vec<String>,
650+
types: Vec<HashSet<DataType>>,
651+
disable_inference: bool,
652+
) -> Schema {
628653
let fields = names
629654
.into_iter()
630655
.zip(types)
@@ -637,10 +662,17 @@ fn build_schema_helper(names: Vec<String>, types: Vec<HashSet<DataType>>) -> Sch
637662
data_type_possibilities.remove(&DataType::Null);
638663

639664
match data_type_possibilities.len() {
640-
// Return Null for columns with only nulls / empty files
641-
// This allows schema merging to work when reading folders
642-
// such files along with normal files.
643-
0 => Field::new(field_name, DataType::Null, true),
665+
// When no types were inferred (empty HashSet):
666+
// - If schema_infer_max_rec was explicitly set to 0, return Utf8
667+
// - Otherwise return Null (whether from reading null values or empty files)
668+
// This allows schema merging to work when reading folders with empty files
669+
0 => {
670+
if disable_inference {
671+
Field::new(field_name, DataType::Utf8, true)
672+
} else {
673+
Field::new(field_name, DataType::Null, true)
674+
}
675+
}
644676
1 => Field::new(
645677
field_name,
646678
data_type_possibilities.iter().next().unwrap().clone(),
@@ -832,7 +864,7 @@ mod tests {
832864
HashSet::from([DataType::Utf8]), // col5
833865
];
834866

835-
let schema = build_schema_helper(column_names, column_type_possibilities);
867+
let schema = build_schema_helper(column_names, column_type_possibilities, false);
836868

837869
// Verify schema has 5 columns
838870
assert_eq!(schema.fields().len(), 5);
@@ -862,7 +894,7 @@ mod tests {
862894
HashSet::from([DataType::Utf8]), // Should remain Utf8
863895
];
864896

865-
let schema = build_schema_helper(column_names, column_type_possibilities);
897+
let schema = build_schema_helper(column_names, column_type_possibilities, false);
866898

867899
// col1 should be Float64 due to Int64 + Float64 = Float64
868900
assert_eq!(*schema.field(0).data_type(), DataType::Float64);
@@ -880,7 +912,7 @@ mod tests {
880912
HashSet::from([DataType::Boolean, DataType::Int64, DataType::Utf8]), // Should resolve to Utf8 due to conflicts
881913
];
882914

883-
let schema = build_schema_helper(column_names, column_type_possibilities);
915+
let schema = build_schema_helper(column_names, column_type_possibilities, false);
884916

885917
// Should default to Utf8 for conflicting types
886918
assert_eq!(*schema.field(0).data_type(), DataType::Utf8);

docs/source/user-guide/sql/format_options.md

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -99,25 +99,25 @@ OPTIONS('COMPRESSION' 'gzip');
9999

100100
The following options are available when reading or writing CSV files. Note: If any unsupported option is specified, an error will be raised and the query will fail.
101101

102-
| Option | Description | Default Value |
103-
| -------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
104-
| COMPRESSION | Sets the compression that should be applied to the entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED. | UNCOMPRESSED |
105-
| HAS_HEADER | Sets if the CSV file should include column headers. If not set, uses session or system default. | None |
106-
| DELIMITER | Sets the character which should be used as the column delimiter within the CSV file. | `,` (comma) |
107-
| QUOTE | Sets the character which should be used for quoting values within the CSV file. | `"` (double quote) |
108-
| TERMINATOR | Sets the character which should be used as the line terminator within the CSV file. | None |
109-
| ESCAPE | Sets the character which should be used for escaping special characters within the CSV file. | None |
110-
| DOUBLE_QUOTE | Sets if quotes within quoted fields should be escaped by doubling them (e.g., `"aaa""bbb"`). | None |
111-
| NEWLINES_IN_VALUES | Sets if newlines in quoted values are supported. If not set, uses session or system default. | None |
112-
| DATE_FORMAT | Sets the format that dates should be encoded in within the CSV file. | None |
113-
| DATETIME_FORMAT | Sets the format that datetimes should be encoded in within the CSV file. | None |
114-
| TIMESTAMP_FORMAT | Sets the format that timestamps should be encoded in within the CSV file. | None |
115-
| TIMESTAMP_TZ_FORMAT | Sets the format that timestamps with timezone should be encoded in within the CSV file. | None |
116-
| TIME_FORMAT | Sets the format that times should be encoded in within the CSV file. | None |
117-
| NULL_VALUE | Sets the string which should be used to indicate null values within the CSV file. | None |
118-
| NULL_REGEX | Sets the regex pattern to match null values when loading CSVs. | None |
119-
| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer the schema. | None |
120-
| COMMENT | Sets the character which should be used to indicate comment lines in the CSV file. | None |
102+
| Option | Description | Default Value |
103+
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
104+
| COMPRESSION | Sets the compression that should be applied to the entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED. | UNCOMPRESSED |
105+
| HAS_HEADER | Sets if the CSV file should include column headers. If not set, uses session or system default. | None |
106+
| DELIMITER | Sets the character which should be used as the column delimiter within the CSV file. | `,` (comma) |
107+
| QUOTE | Sets the character which should be used for quoting values within the CSV file. | `"` (double quote) |
108+
| TERMINATOR | Sets the character which should be used as the line terminator within the CSV file. | None |
109+
| ESCAPE | Sets the character which should be used for escaping special characters within the CSV file. | None |
110+
| DOUBLE_QUOTE | Sets if quotes within quoted fields should be escaped by doubling them (e.g., `"aaa""bbb"`). | None |
111+
| NEWLINES_IN_VALUES | Sets if newlines in quoted values are supported. If not set, uses session or system default. | None |
112+
| DATE_FORMAT | Sets the format that dates should be encoded in within the CSV file. | None |
113+
| DATETIME_FORMAT | Sets the format that datetimes should be encoded in within the CSV file. | None |
114+
| TIMESTAMP_FORMAT | Sets the format that timestamps should be encoded in within the CSV file. | None |
115+
| TIMESTAMP_TZ_FORMAT | Sets the format that timestamps with timezone should be encoded in within the CSV file. | None |
116+
| TIME_FORMAT | Sets the format that times should be encoded in within the CSV file. | None |
117+
| NULL_VALUE | Sets the string which should be used to indicate null values within the CSV file. | None |
118+
| NULL_REGEX | Sets the regex pattern to match null values when loading CSVs. | None |
119+
| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer the schema. If set to 0, schema inference is disabled and all fields will be inferred as Utf8 (string) type. | None |
120+
| COMMENT | Sets the character which should be used to indicate comment lines in the CSV file. | None |
121121

122122
**Example:**
123123

0 commit comments

Comments
 (0)