fix: Eliminate endless busy looping in read_json_files on failed read (#1489)

OussamaSaoudi · web-flow · commit 743018b22094 · 2025-11-14T17:21:14.000-08:00
## What changes are proposed in this pull request? This PR ensures that the stream built on the Arrow JSON reader emits at most one error per file and then ends. Previously, a failed read would produce error variants endlessly, because the reader only terminates upon EOF; see the Arrow [`read` function](https://arrow.apache.org/rust/arrow_json/reader/struct.Reader.html#method.read), which shows that it only terminates on EOF. Fixes: #1050  ## How was this change tested? This takes the repro and ensures that shutting down the tokio runtime never times out. A tokio runtime shutdown timeout indicates that one of the threads never stops.
diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs
@@ -126,6 +126,7 @@ impl<E: TaskExecutor> JsonHandler for DefaultJsonHandler<E> {
             while let Some(item) = stream.next().await {
                 if tx.send(item).is_err() {
                     warn!("read_json receiver end of channel dropped before sending completed");
+                    break;
                 }
             }
         });
@@ -199,7 +200,19 @@ impl JsonOpener {
                 let reader = ReaderBuilder::new(schema)
                     .with_batch_size(batch_size)
                     .build(BufReader::new(file))?;
-                Ok(futures::stream::iter(reader).map_err(Error::from).boxed())
+
+                let mut seen_error = false;
+                Ok(futures::stream::iter(reader)
+                    .map_err(Error::from)
+                    .take_while(move |result| {
+                        // Emit exactly one error, then stop the stream. We check seen_error BEFORE
+                        // updating it so the first error passes through, but subsequent items don't.
+                        // This is necessary because Arrow's Reader loops the same error indefinitely.
+                        let return_this = !seen_error;
+                        seen_error = seen_error || result.is_err();
+                        futures::future::ready(return_this)
+                    })
+                    .boxed())
             }
             GetResultPayload::Stream(s) => {
                 let mut decoder = ReaderBuilder::new(schema)
@@ -271,6 +284,7 @@ mod tests {
         PutPayload, PutResult, Result,
     };
     use serde_json::json;
+    use tracing::info;
 
     // TODO: should just use the one from test_utils, but running into dependency issues
     fn into_record_batch(engine_data: Box<dyn EngineData>) -> RecordBatch {
@@ -623,6 +637,67 @@ mod tests {
         );
     }
 
+    use crate::engine::default::DefaultEngine;
+    use crate::schema::StructType;
+    use crate::Engine;
+    use std::io::Write;
+    use tempfile::NamedTempFile;
+
+    fn make_invalid_named_temp() -> (NamedTempFile, Url) {
+        let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
+        write!(temp_file, r#"this is not valid json"#).expect("Failed to write to temp file");
+        let path = temp_file.path();
+        let file_url = Url::from_file_path(path).expect("Failed to create file URL");
+
+        info!("Created temporary malformed file at: {file_url}");
+        (temp_file, file_url)
+    }
+
+    #[test]
+    fn test_read_invalid_json() -> Result<(), Box<dyn std::error::Error>> {
+        let _ = tracing_subscriber::fmt().try_init();
+        let (_temp_file1, file_url1) = make_invalid_named_temp();
+        let (_temp_file2, file_url2) = make_invalid_named_temp();
+        let field = StructField::nullable("name", crate::schema::DataType::BOOLEAN);
+        let schema = Arc::new(StructType::try_new(vec![field]).unwrap());
+        let default_engine = DefaultEngine::new(Arc::new(LocalFileSystem::new()));
+
+        // Helper to check that we get the expected number of errors and that the stream then ends
+        let check_errors = |file_urls: Vec<_>, expected_errors: usize| {
+            let file_vec: Vec<_> = file_urls
+                .into_iter()
+                .map(|url| FileMeta::new(url, 1, 1))
+                .collect();
+
+            let mut iter = default_engine
+                .json_handler()
+                .read_json_files(&file_vec, schema.clone(), None)
+                .unwrap();
+
+            for _ in 0..expected_errors {
+                assert!(
+                    iter.next().unwrap().is_err(),
+                    "Read succeeded unexpectedly. The JSON should have been invalid."
+                );
+            }
+
+            assert!(
+                iter.next().is_none(),
+                "The stream should end once the read result fails"
+            );
+        };
+
+        // CASE 1: Single failing file
+        info!("\nAttempting to read single malformed JSON file...");
+        check_errors(vec![file_url1.clone()], 1);
+
+        // CASE 2: Two failing files
+        info!("\nAttempting to read two malformed JSON files...");
+        check_errors(vec![file_url1, file_url2], 2);
+
+        Ok(())
+    }
+
     #[tokio::test(flavor = "multi_thread", worker_threads = 3)]
     async fn test_read_json_files_ordering() {
         // this test checks that the read_json_files method returns the files in order in the