Skip to content

Commit 0bd8809

Browse files
authored
fix: csv schema_infer_max_records set to 0 returns null datatype (#19432)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #19417 ## Rationale for this change - See #19417 - Related to #17796 ## What changes are included in this PR? When `schema_infer_max_records` is set to 0 for CSV, the inferred data type is now returned as string (`Utf8`) instead of `Null`. ## Are these changes tested? Yes — a test case was added covering `schema_infer_max_records` equal to 0. ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 33ac70d commit 0bd8809

File tree

3 files changed

+88
-28
lines changed

3 files changed

+88
-28
lines changed

datafusion/core/src/datasource/file_format/csv.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,4 +1536,32 @@ mod tests {
15361536

15371537
Ok(())
15381538
}
1539+
1540+
#[tokio::test]
1541+
async fn test_infer_schema_with_zero_max_records() -> Result<()> {
1542+
let session_ctx = SessionContext::new();
1543+
let state = session_ctx.state();
1544+
1545+
let root = format!("{}/csv", arrow_test_data());
1546+
let format = CsvFormat::default()
1547+
.with_has_header(true)
1548+
.with_schema_infer_max_rec(0); // Set to 0 to disable inference
1549+
let exec = scan_format(
1550+
&state,
1551+
&format,
1552+
None,
1553+
&root,
1554+
"aggregate_test_100.csv",
1555+
None,
1556+
None,
1557+
)
1558+
.await?;
1559+
1560+
// related to https://github.com/apache/datafusion/issues/19417
1561+
for f in exec.schema().fields() {
1562+
assert_eq!(*f.data_type(), DataType::Utf8);
1563+
}
1564+
1565+
Ok(())
1566+
}
15391567
}

datafusion/datasource-csv/src/file_format.rs

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,11 @@ impl CsvFormat {
211211

212212
/// Set a limit in terms of records to scan to infer the schema
213213
/// - default to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
214+
///
215+
/// # Behavior when set to 0
216+
///
217+
/// When `max_rec` is set to 0, schema inference is disabled and all fields
218+
/// will be inferred as `Utf8` (string) type, regardless of their actual content.
214219
pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self {
215220
self.options.schema_infer_max_rec = Some(max_rec);
216221
self
@@ -529,6 +534,7 @@ impl CsvFormat {
529534
let mut column_names = vec![];
530535
let mut column_type_possibilities = vec![];
531536
let mut record_number = -1;
537+
let initial_records_to_read = records_to_read;
532538

533539
pin_mut!(stream);
534540

@@ -619,12 +625,31 @@ impl CsvFormat {
619625
}
620626
}
621627

622-
let schema = build_schema_helper(column_names, column_type_possibilities);
628+
let schema = build_schema_helper(
629+
column_names,
630+
column_type_possibilities,
631+
initial_records_to_read == 0,
632+
);
623633
Ok((schema, total_records_read))
624634
}
625635
}
626636

627-
fn build_schema_helper(names: Vec<String>, types: Vec<HashSet<DataType>>) -> Schema {
637+
/// Builds a schema from column names and their possible data types.
638+
///
639+
/// # Arguments
640+
///
641+
/// * `names` - Vector of column names
642+
/// * `types` - Vector of possible data types for each column (as HashSets)
643+
/// * `disable_inference` - When true, forces all columns with no inferred types to be Utf8.
644+
/// This should be set to true when `schema_infer_max_rec` is explicitly
645+
/// set to 0, indicating the user wants to skip type inference and treat
646+
/// all fields as strings. When false, columns with no inferred types
647+
/// will be set to Null, allowing schema merging to work properly.
648+
fn build_schema_helper(
649+
names: Vec<String>,
650+
types: Vec<HashSet<DataType>>,
651+
disable_inference: bool,
652+
) -> Schema {
628653
let fields = names
629654
.into_iter()
630655
.zip(types)
@@ -637,10 +662,17 @@ fn build_schema_helper(names: Vec<String>, types: Vec<HashSet<DataType>>) -> Sch
637662
data_type_possibilities.remove(&DataType::Null);
638663

639664
match data_type_possibilities.len() {
640-
// Return Null for columns with only nulls / empty files
641-
// This allows schema merging to work when reading folders
642-
// such files along with normal files.
643-
0 => Field::new(field_name, DataType::Null, true),
665+
// When no types were inferred (empty HashSet):
666+
// - If schema_infer_max_rec was explicitly set to 0, return Utf8
667+
// - Otherwise return Null (whether from reading null values or empty files)
668+
// This allows schema merging to work when reading folders with empty files
669+
0 => {
670+
if disable_inference {
671+
Field::new(field_name, DataType::Utf8, true)
672+
} else {
673+
Field::new(field_name, DataType::Null, true)
674+
}
675+
}
644676
1 => Field::new(
645677
field_name,
646678
data_type_possibilities.iter().next().unwrap().clone(),
@@ -832,7 +864,7 @@ mod tests {
832864
HashSet::from([DataType::Utf8]), // col5
833865
];
834866

835-
let schema = build_schema_helper(column_names, column_type_possibilities);
867+
let schema = build_schema_helper(column_names, column_type_possibilities, false);
836868

837869
// Verify schema has 5 columns
838870
assert_eq!(schema.fields().len(), 5);
@@ -862,7 +894,7 @@ mod tests {
862894
HashSet::from([DataType::Utf8]), // Should remain Utf8
863895
];
864896

865-
let schema = build_schema_helper(column_names, column_type_possibilities);
897+
let schema = build_schema_helper(column_names, column_type_possibilities, false);
866898

867899
// col1 should be Float64 due to Int64 + Float64 = Float64
868900
assert_eq!(*schema.field(0).data_type(), DataType::Float64);
@@ -880,7 +912,7 @@ mod tests {
880912
HashSet::from([DataType::Boolean, DataType::Int64, DataType::Utf8]), // Should resolve to Utf8 due to conflicts
881913
];
882914

883-
let schema = build_schema_helper(column_names, column_type_possibilities);
915+
let schema = build_schema_helper(column_names, column_type_possibilities, false);
884916

885917
// Should default to Utf8 for conflicting types
886918
assert_eq!(*schema.field(0).data_type(), DataType::Utf8);

docs/source/user-guide/sql/format_options.md

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -99,25 +99,25 @@ OPTIONS('COMPRESSION' 'gzip');
9999

100100
The following options are available when reading or writing CSV files. Note: If any unsupported option is specified, an error will be raised and the query will fail.
101101

102-
| Option | Description | Default Value |
103-
| -------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
104-
| COMPRESSION | Sets the compression that should be applied to the entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED. | UNCOMPRESSED |
105-
| HAS_HEADER | Sets if the CSV file should include column headers. If not set, uses session or system default. | None |
106-
| DELIMITER | Sets the character which should be used as the column delimiter within the CSV file. | `,` (comma) |
107-
| QUOTE | Sets the character which should be used for quoting values within the CSV file. | `"` (double quote) |
108-
| TERMINATOR | Sets the character which should be used as the line terminator within the CSV file. | None |
109-
| ESCAPE | Sets the character which should be used for escaping special characters within the CSV file. | None |
110-
| DOUBLE_QUOTE | Sets if quotes within quoted fields should be escaped by doubling them (e.g., `"aaa""bbb"`). | None |
111-
| NEWLINES_IN_VALUES | Sets if newlines in quoted values are supported. If not set, uses session or system default. | None |
112-
| DATE_FORMAT | Sets the format that dates should be encoded in within the CSV file. | None |
113-
| DATETIME_FORMAT | Sets the format that datetimes should be encoded in within the CSV file. | None |
114-
| TIMESTAMP_FORMAT | Sets the format that timestamps should be encoded in within the CSV file. | None |
115-
| TIMESTAMP_TZ_FORMAT | Sets the format that timestamps with timezone should be encoded in within the CSV file. | None |
116-
| TIME_FORMAT | Sets the format that times should be encoded in within the CSV file. | None |
117-
| NULL_VALUE | Sets the string which should be used to indicate null values within the CSV file. | None |
118-
| NULL_REGEX | Sets the regex pattern to match null values when loading CSVs. | None |
119-
| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer the schema. | None |
120-
| COMMENT | Sets the character which should be used to indicate comment lines in the CSV file. | None |
102+
| Option | Description | Default Value |
103+
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
104+
| COMPRESSION | Sets the compression that should be applied to the entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED. | UNCOMPRESSED |
105+
| HAS_HEADER | Sets if the CSV file should include column headers. If not set, uses session or system default. | None |
106+
| DELIMITER | Sets the character which should be used as the column delimiter within the CSV file. | `,` (comma) |
107+
| QUOTE | Sets the character which should be used for quoting values within the CSV file. | `"` (double quote) |
108+
| TERMINATOR | Sets the character which should be used as the line terminator within the CSV file. | None |
109+
| ESCAPE | Sets the character which should be used for escaping special characters within the CSV file. | None |
110+
| DOUBLE_QUOTE | Sets if quotes within quoted fields should be escaped by doubling them (e.g., `"aaa""bbb"`). | None |
111+
| NEWLINES_IN_VALUES | Sets if newlines in quoted values are supported. If not set, uses session or system default. | None |
112+
| DATE_FORMAT | Sets the format that dates should be encoded in within the CSV file. | None |
113+
| DATETIME_FORMAT | Sets the format that datetimes should be encoded in within the CSV file. | None |
114+
| TIMESTAMP_FORMAT | Sets the format that timestamps should be encoded in within the CSV file. | None |
115+
| TIMESTAMP_TZ_FORMAT | Sets the format that timestamps with timezone should be encoded in within the CSV file. | None |
116+
| TIME_FORMAT | Sets the format that times should be encoded in within the CSV file. | None |
117+
| NULL_VALUE | Sets the string which should be used to indicate null values within the CSV file. | None |
118+
| NULL_REGEX | Sets the regex pattern to match null values when loading CSVs. | None |
119+
| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer the schema. If set to 0, schema inference is disabled and all fields will be inferred as Utf8 (string) type. | None |
120+
| COMMENT | Sets the character which should be used to indicate comment lines in the CSV file. | None |
121121

122122
**Example:**
123123

0 commit comments

Comments
 (0)