refactor(tests): simplify test util using structured Spec instead of JSON

lemorage · lemorage · commit d347648ec6d7 · 2025-07-10T14:14:31.000+02:00
diff --git a/src/ops/functions/embed_text.rs b/src/ops/functions/embed_text.rs
@@ -100,22 +100,18 @@ pub fn register(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
 mod tests {
     use super::*;
     use crate::ops::functions::test_utils::{build_arg_schema, test_flow_function};
-    use serde_json::json;
 
     #[tokio::test]
     #[ignore = "This test requires OpenAI API key or a configured local LLM and may make network calls."]
-    async fn test_embed_text_with_util() {
-        let context = Arc::new(FlowInstanceContext {
-            flow_instance_name: "test_embed_text_flow".to_string(),
-            auth_registry: Arc::new(AuthRegistry::default()),
-            py_exec_ctx: None,
-        });
-
+    async fn test_embed_text() {
         // Using OpenAI as an example.
-        let spec_json = json!({
-            "api_type": "OpenAi",
-            "model": "text-embedding-ada-002",
-        });
+        let spec = Spec {
+            api_type: LlmApiType::OpenAi,
+            model: "text-embedding-ada-002".to_string(),
+            address: None,
+            output_dimension: None,
+            task_type: None,
+        };
 
         let factory = Arc::new(Factory);
         let text_content = "CocoIndex is a performant data transformation framework for AI.";
@@ -128,14 +124,7 @@ mod tests {
             BasicValueType::Str,
         )];
 
-        let result = test_flow_function(
-            factory,
-            spec_json,
-            input_arg_schemas,
-            input_args_values,
-            context,
-        )
-        .await;
+        let result = test_flow_function(factory, spec, input_arg_schemas, input_args_values).await;
 
         if result.is_err() {
             eprintln!(
diff --git a/src/ops/functions/extract_by_llm.rs b/src/ops/functions/extract_by_llm.rs
@@ -160,17 +160,10 @@ impl SimpleFunctionFactoryBase for Factory {
 mod tests {
     use super::*;
     use crate::ops::functions::test_utils::{build_arg_schema, test_flow_function};
-    use serde_json::json;
 
     #[tokio::test]
     #[ignore = "This test requires an OpenAI API key or a configured local LLM and may make network calls."]
-    async fn test_extract_by_llm_with_util() {
-        let context = Arc::new(FlowInstanceContext {
-            flow_instance_name: "test_extract_by_llm_flow".to_string(),
-            auth_registry: Arc::new(AuthRegistry::default()),
-            py_exec_ctx: None,
-        });
-
+    async fn test_extract_by_llm() {
         // Define the expected output structure
         let target_output_schema = StructSchema {
             fields: Arc::new(vec![
@@ -193,20 +186,15 @@ mod tests {
         };
 
         // Spec using OpenAI as an example.
-        let spec_json = json!({
-            "llm_spec": {
-                "api_type": "OpenAi",
-                "model": "gpt-4o",
-                "address": null,
-                "api_key_auth": null,
-                "max_tokens": 100,
-                "temperature": 0.0,
-                "top_p": null,
-                "params": {}
+        let spec = Spec {
+            llm_spec: LlmSpec {
+                api_type: crate::llm::LlmApiType::OpenAi,
+                model: "gpt-4o".to_string(),
+                address: None,
             },
-            "output_type": output_type_spec,
-            "instruction": "Extract the name and value from the text. The name is a string, the value is an integer."
-        });
+            output_type: output_type_spec,
+            instruction: Some("Extract the name and value from the text. The name is a string, the value is an integer.".to_string()),
+        };
 
         let factory = Arc::new(Factory);
         let text_content = "The item is called 'CocoIndex Test' and its value is 42.";
@@ -219,14 +207,7 @@ mod tests {
             BasicValueType::Str,
         )];
 
-        let result = test_flow_function(
-            factory,
-            spec_json,
-            input_arg_schemas,
-            input_args_values,
-            context,
-        )
-        .await;
+        let result = test_flow_function(factory, spec, input_arg_schemas, input_args_values).await;
 
         if result.is_err() {
             eprintln!(
diff --git a/src/ops/functions/parse_json.rs b/src/ops/functions/parse_json.rs
@@ -110,14 +110,8 @@ mod tests {
     use serde_json::json;
 
     #[tokio::test]
-    async fn test_parse_json_util() {
-        let context = Arc::new(FlowInstanceContext {
-            flow_instance_name: "test_parse_json_flow".to_string(),
-            auth_registry: Arc::new(AuthRegistry::default()),
-            py_exec_ctx: None,
-        });
-
-        let spec_json = json!({});
+    async fn test_parse_json() {
+        let spec = EmptySpec {};
 
         let factory = Arc::new(Factory);
         let json_string_content = r#"{"city": "Magdeburg"}"#;
@@ -134,14 +128,7 @@ mod tests {
             build_arg_schema("language", lang_value, BasicValueType::Str),
         ];
 
-        let result = test_flow_function(
-            factory,
-            spec_json,
-            input_arg_schemas,
-            input_args_values,
-            context,
-        )
-        .await;
+        let result = test_flow_function(factory, spec, input_arg_schemas, input_args_values).await;
 
         assert!(
             result.is_ok(),
diff --git a/src/ops/functions/split_recursively.rs b/src/ops/functions/split_recursively.rs
@@ -1032,7 +1032,6 @@ pub fn register(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
 mod tests {
     use super::*;
     use crate::ops::functions::test_utils::{build_arg_schema, test_flow_function};
-    use serde_json::json;
 
     // Helper function to assert chunk text and its consistency with the range within the original text.
     fn assert_chunk_text_consistency(
@@ -1075,14 +1074,10 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_split_recursively_with_util() {
-        let context = Arc::new(FlowInstanceContext {
-            flow_instance_name: "test_parse_recursively_flow".to_string(),
-            auth_registry: Arc::new(AuthRegistry::default()),
-            py_exec_ctx: None,
-        });
-
-        let spec_json = json!({});
+    async fn test_split_recursively() {
+        let spec = Spec {
+            custom_languages: vec![],
+        };
         let factory = Arc::new(Factory);
         let text_content = "Linea 1.\nLinea 2.\n\nLinea 3.";
 
@@ -1102,14 +1097,7 @@ mod tests {
             build_arg_schema("language", Value::Null, BasicValueType::Str),
         ];
 
-        let result = test_flow_function(
-            factory,
-            spec_json,
-            input_arg_schemas,
-            input_args_values,
-            context,
-        )
-        .await;
+        let result = test_flow_function(factory, spec, input_arg_schemas, input_args_values).await;
 
         assert!(
             result.is_ok(),
@@ -1250,6 +1238,7 @@ mod tests {
         assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
         assert!(chunks2[0].text.len() <= 20);
     }
+
     #[test]
     fn test_basic_split_with_overlap() {
         let text = "This is a test text that is a bit longer to see how the overlap works.";
@@ -1269,6 +1258,7 @@ mod tests {
             assert!(chunks[0].text.len() <= 25);
         }
     }
+
     #[test]
     fn test_split_trims_whitespace() {
         let text = "  \n First chunk. \n\n  Second chunk with spaces at the end.   \n";
diff --git a/src/ops/functions/test_utils.rs b/src/ops/functions/test_utils.rs
@@ -1,11 +1,10 @@
 use crate::builder::plan::AnalyzedValueMapping;
 use crate::ops::sdk::{
-    BasicValueType, EnrichedValueType, FlowInstanceContext, OpArgSchema, OpArgsResolver,
-    SimpleFunctionExecutor, SimpleFunctionFactoryBase, Value, make_output_type,
+    AuthRegistry, BasicValueType, EnrichedValueType, FlowInstanceContext, OpArgSchema,
+    OpArgsResolver, SimpleFunctionExecutor, SimpleFunctionFactoryBase, Value, make_output_type,
 };
 use anyhow::Result;
 use serde::de::DeserializeOwned;
-use serde_json::Value as JsonValue;
 use std::sync::Arc;
 
 fn new_literal_op_arg_schema(
@@ -30,18 +29,20 @@ pub fn build_arg_schema(name: &str, value: Value, value_type: BasicValueType) ->
 // This function tests a flow function by providing a spec, input argument schemas, and values.
 pub async fn test_flow_function<S, R, F>(
     factory: Arc<F>,
-    spec_json: JsonValue,
+    spec: S,
     input_arg_schemas: Vec<OpArgSchema>,
     input_arg_values: Vec<Value>,
-    context: Arc<FlowInstanceContext>,
 ) -> Result<Value>
 where
     S: DeserializeOwned + Send + Sync + 'static,
     R: Send + Sync + 'static,
     F: SimpleFunctionFactoryBase<Spec = S, ResolvedArgs = R> + ?Sized,
 {
-    // 1. Deserialize Spec
-    let spec: S = serde_json::from_value(spec_json)?;
+    let context = Arc::new(FlowInstanceContext {
+        flow_instance_name: "test_flow_function".to_string(),
+        auth_registry: Arc::new(AuthRegistry::default()),
+        py_exec_ctx: None,
+    });
 
     // 2. Resolve Schema & Args
     // The caller of test_flow_function will be responsible for creating these schemas.