@@ -939,55 +939,13 @@ pub fn register(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
939939#[ cfg( test) ]
940940mod tests {
941941 use super :: * ;
942- use crate :: ops:: { functions:: test_utils:: test_flow_function, shared:: split:: OutputPosition } ;
943-
944- // Helper function to assert chunk text and its consistency with the range within the original text.
945- fn assert_chunk_text_consistency (
946- full_text : & str , // Added full text
947- actual_chunk : & ChunkOutput < ' _ > ,
948- expected_text : & str ,
949- context : & str ,
950- ) {
951- // Extract text using the chunk's range from the original full text.
952- let extracted_text = full_text
953- . get ( actual_chunk. start_pos . byte_offset ..actual_chunk. end_pos . byte_offset )
954- . unwrap ( ) ;
955- // Assert that the expected text matches the text provided in the chunk.
956- assert_eq ! (
957- actual_chunk. text, expected_text,
958- "Provided chunk text mismatch - {context}"
959- ) ;
960- // Assert that the expected text also matches the text extracted using the chunk's range.
961- assert_eq ! (
962- extracted_text, expected_text,
963- "Range inconsistency: extracted text mismatch - {context}"
964- ) ;
965- }
966-
967- // Creates a default RecursiveChunker for testing, assuming no language-specific parsing.
968- fn create_test_chunker < ' a > (
969- text : & ' a str ,
970- chunk_size : usize ,
971- min_chunk_size : usize ,
972- chunk_overlap : usize ,
973- ) -> RecursiveChunker < ' a > {
974- RecursiveChunker {
975- full_text : text,
976- chunk_size,
977- chunk_overlap,
978- min_chunk_size,
979- }
980- }
981-
982- #[ tokio:: test]
983- async fn test_split_recursively ( ) {
984- let spec = Spec {
985- custom_languages : vec ! [ ] ,
986- } ;
987- let factory = Arc :: new ( Factory ) ;
988- let text_content = "Linea 1.\n Linea 2.\n \n Linea 3." ;
942+ use crate :: ops:: functions:: test_utils:: test_flow_function;
943+ use crate :: ops:: sdk:: { BasicValueType , KeyValue , RangeValue , make_output_type} ;
944+ use crate :: ops:: shared:: split:: OutputPosition ;
989945
990- let input_arg_schemas = & [
946+ // Helper function to build the standard input argument schemas for split_recursively tests
947+ fn build_split_recursively_arg_schemas ( ) -> Vec < ( Option < & ' static str > , EnrichedValueType ) > {
948+ vec ! [
991949 (
992950 Some ( "text" ) ,
993951 make_output_type( BasicValueType :: Str ) . with_nullable( true ) ,
@@ -1008,7 +966,17 @@ mod tests {
1008966 Some ( "language" ) ,
1009967 make_output_type( BasicValueType :: Str ) . with_nullable( true ) ,
1010968 ) ,
1011- ] ;
969+ ]
970+ }
971+
972+ #[ tokio:: test]
973+ async fn test_split_recursively ( ) {
974+ let spec = Spec {
975+ custom_languages : vec ! [ ] ,
976+ } ;
977+ let factory = Arc :: new ( Factory ) ;
978+ let text_content = "Linea 1.\n Linea 2.\n \n Linea 3." ;
979+ let input_arg_schemas = & build_split_recursively_arg_schemas ( ) ;
1012980
1013981 {
1014982 let result = test_flow_function (
@@ -1168,88 +1136,214 @@ mod tests {
11681136 ) ;
11691137 }
11701138
1171- #[ test]
1172- fn test_basic_split_no_overlap ( ) {
1139+ #[ tokio:: test]
1140+ async fn test_basic_split_no_overlap ( ) {
1141+ let spec = Spec {
1142+ custom_languages : vec ! [ ] ,
1143+ } ;
1144+ let factory = Arc :: new ( Factory ) ;
11731145 let text = "Linea 1.\n Linea 2.\n \n Linea 3." ;
1174- let chunker = create_test_chunker ( text, 15 , 5 , 0 ) ;
1175-
1176- let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk {
1177- lang_config : & DEFAULT_LANGUAGE_CONFIG ,
1178- next_regexp_sep_id : 0 ,
1179- } ) ;
1146+ let input_arg_schemas = & build_split_recursively_arg_schemas ( ) ;
11801147
1181- assert ! ( result. is_ok( ) ) ;
1182- let chunks = result. unwrap ( ) ;
1148+ {
1149+ let result = test_flow_function (
1150+ & factory,
1151+ & spec,
1152+ input_arg_schemas,
1153+ vec ! [
1154+ text. to_string( ) . into( ) ,
1155+ ( 15i64 ) . into( ) ,
1156+ ( 5i64 ) . into( ) ,
1157+ ( 0i64 ) . into( ) ,
1158+ Value :: Null ,
1159+ ] ,
1160+ )
1161+ . await ;
1162+ assert ! (
1163+ result. is_ok( ) ,
1164+ "test_flow_function failed: {:?}" ,
1165+ result. err( )
1166+ ) ;
1167+ let value = result. unwrap ( ) ;
1168+ match value {
1169+ Value :: KTable ( table) => {
1170+ let expected_chunks = vec ! [
1171+ ( RangeValue :: new( 0 , 8 ) , "Linea 1." ) ,
1172+ ( RangeValue :: new( 9 , 17 ) , "Linea 2." ) ,
1173+ ( RangeValue :: new( 19 , 27 ) , "Linea 3." ) ,
1174+ ] ;
11831175
1184- assert_eq ! ( chunks. len( ) , 3 ) ;
1185- assert_chunk_text_consistency ( text, & chunks[ 0 ] , "Linea 1." , "Test 1, Chunk 0" ) ;
1186- assert_chunk_text_consistency ( text, & chunks[ 1 ] , "Linea 2." , "Test 1, Chunk 1" ) ;
1187- assert_chunk_text_consistency ( text, & chunks[ 2 ] , "Linea 3." , "Test 1, Chunk 2" ) ;
1176+ for ( range, expected_text) in expected_chunks {
1177+ let key = KeyValue :: from_single_part ( range) ;
1178+ match table. get ( & key) {
1179+ Some ( scope_value_ref) => {
1180+ let chunk_text =
1181+ scope_value_ref. 0 . fields [ 0 ] . as_str ( ) . unwrap_or_else ( |_| {
1182+ panic ! ( "Chunk text not a string for key {key:?}" )
1183+ } ) ;
1184+ assert_eq ! ( * * chunk_text, * expected_text) ;
1185+ }
1186+ None => panic ! ( "Expected row value for key {key:?}, not found" ) ,
1187+ }
1188+ }
1189+ }
1190+ other => panic ! ( "Expected Value::KTable, got {other:?}" ) ,
1191+ }
1192+ }
11881193
11891194 // Test splitting when chunk_size forces breaks within segments.
11901195 let text2 = "A very very long text that needs to be split." ;
1191- let chunker2 = create_test_chunker ( text2, 20 , 12 , 0 ) ;
1192- let result2 = chunker2. split_root_chunk ( ChunkKind :: RegexpSepChunk {
1193- lang_config : & DEFAULT_LANGUAGE_CONFIG ,
1194- next_regexp_sep_id : 0 ,
1195- } ) ;
1196-
1197- assert ! ( result2. is_ok( ) ) ;
1198- let chunks2 = result2. unwrap ( ) ;
1199-
1200- // Expect multiple chunks, likely split by spaces due to chunk_size.
1201- assert ! ( chunks2. len( ) > 1 ) ;
1202- assert_chunk_text_consistency ( text2, & chunks2[ 0 ] , "A very very long" , "Test 2, Chunk 0" ) ;
1203- assert ! ( chunks2[ 0 ] . text. len( ) <= 20 ) ;
1196+ {
1197+ let result = test_flow_function (
1198+ & factory,
1199+ & spec,
1200+ input_arg_schemas,
1201+ vec ! [
1202+ text2. to_string( ) . into( ) ,
1203+ ( 20i64 ) . into( ) ,
1204+ ( 12i64 ) . into( ) ,
1205+ ( 0i64 ) . into( ) ,
1206+ Value :: Null ,
1207+ ] ,
1208+ )
1209+ . await ;
1210+ assert ! (
1211+ result. is_ok( ) ,
1212+ "test_flow_function failed: {:?}" ,
1213+ result. err( )
1214+ ) ;
1215+ let value = result. unwrap ( ) ;
1216+ match value {
1217+ Value :: KTable ( table) => {
1218+ // Expect multiple chunks, likely split by spaces due to chunk_size.
1219+ assert ! ( table. len( ) > 1 ) ;
1220+
1221+ let key = KeyValue :: from_single_part ( RangeValue :: new ( 0 , 16 ) ) ;
1222+ match table. get ( & key) {
1223+ Some ( scope_value_ref) => {
1224+ let chunk_text =
1225+ scope_value_ref. 0 . fields [ 0 ] . as_str ( ) . unwrap_or_else ( |_| {
1226+ panic ! ( "Chunk text not a string for key {key:?}" )
1227+ } ) ;
1228+ assert_eq ! ( & * * chunk_text, "A very very long" ) ;
1229+ assert ! ( chunk_text. len( ) <= 20 ) ;
1230+ }
1231+ None => panic ! ( "Expected row value for key {key:?}, not found" ) ,
1232+ }
1233+ }
1234+ other => panic ! ( "Expected Value::KTable, got {other:?}" ) ,
1235+ }
1236+ }
12041237 }
12051238
1206- #[ test]
1207- fn test_basic_split_with_overlap ( ) {
1239+ #[ tokio:: test]
1240+ async fn test_basic_split_with_overlap ( ) {
1241+ let spec = Spec {
1242+ custom_languages : vec ! [ ] ,
1243+ } ;
1244+ let factory = Arc :: new ( Factory ) ;
12081245 let text = "This is a test text that is a bit longer to see how the overlap works." ;
1209- let chunker = create_test_chunker ( text, 20 , 10 , 5 ) ;
1210-
1211- let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk {
1212- lang_config : & DEFAULT_LANGUAGE_CONFIG ,
1213- next_regexp_sep_id : 0 ,
1214- } ) ;
1215-
1216- assert ! ( result. is_ok( ) ) ;
1217- let chunks = result. unwrap ( ) ;
1246+ let input_arg_schemas = & build_split_recursively_arg_schemas ( ) ;
12181247
1219- assert ! ( chunks. len( ) > 1 ) ;
1248+ {
1249+ let result = test_flow_function (
1250+ & factory,
1251+ & spec,
1252+ input_arg_schemas,
1253+ vec ! [
1254+ text. to_string( ) . into( ) ,
1255+ ( 20i64 ) . into( ) ,
1256+ ( 10i64 ) . into( ) ,
1257+ ( 5i64 ) . into( ) ,
1258+ Value :: Null ,
1259+ ] ,
1260+ )
1261+ . await ;
1262+ assert ! (
1263+ result. is_ok( ) ,
1264+ "test_flow_function failed: {:?}" ,
1265+ result. err( )
1266+ ) ;
1267+ let value = result. unwrap ( ) ;
1268+ match value {
1269+ Value :: KTable ( table) => {
1270+ assert ! ( table. len( ) > 1 ) ;
12201271
1221- if chunks. len ( ) >= 2 {
1222- assert ! ( chunks[ 0 ] . text. len( ) <= 25 ) ;
1272+ // Check first chunk length
1273+ if table. len ( ) >= 2 {
1274+ let first_key = table. keys ( ) . next ( ) . unwrap ( ) ;
1275+ match table. get ( first_key) {
1276+ Some ( scope_value_ref) => {
1277+ let chunk_text =
1278+ scope_value_ref. 0 . fields [ 0 ] . as_str ( ) . unwrap_or_else ( |_| {
1279+ panic ! ( "Chunk text not a string for key {first_key:?}" )
1280+ } ) ;
1281+ assert ! ( chunk_text. len( ) <= 25 ) ;
1282+ }
1283+ None => panic ! ( "Expected row value for first key, not found" ) ,
1284+ }
1285+ }
1286+ }
1287+ other => panic ! ( "Expected Value::KTable, got {other:?}" ) ,
1288+ }
12231289 }
12241290 }
12251291
1226- #[ test]
1227- fn test_split_trims_whitespace ( ) {
1292+ #[ tokio:: test]
1293+ async fn test_split_trims_whitespace ( ) {
1294+ let spec = Spec {
1295+ custom_languages : vec ! [ ] ,
1296+ } ;
1297+ let factory = Arc :: new ( Factory ) ;
12281298 let text = " \n First chunk. \n \n Second chunk with spaces at the end. \n " ;
1229- let chunker = create_test_chunker ( text, 30 , 10 , 0 ) ;
1230-
1231- let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk {
1232- lang_config : & DEFAULT_LANGUAGE_CONFIG ,
1233- next_regexp_sep_id : 0 ,
1234- } ) ;
1299+ let input_arg_schemas = & build_split_recursively_arg_schemas ( ) ;
12351300
1236- assert ! ( result. is_ok( ) ) ;
1237- let chunks = result. unwrap ( ) ;
1301+ {
1302+ let result = test_flow_function (
1303+ & factory,
1304+ & spec,
1305+ input_arg_schemas,
1306+ vec ! [
1307+ text. to_string( ) . into( ) ,
1308+ ( 30i64 ) . into( ) ,
1309+ ( 10i64 ) . into( ) ,
1310+ ( 0i64 ) . into( ) ,
1311+ Value :: Null ,
1312+ ] ,
1313+ )
1314+ . await ;
1315+ assert ! (
1316+ result. is_ok( ) ,
1317+ "test_flow_function failed: {:?}" ,
1318+ result. err( )
1319+ ) ;
1320+ let value = result. unwrap ( ) ;
1321+ match value {
1322+ Value :: KTable ( table) => {
1323+ assert_eq ! ( table. len( ) , 3 ) ;
12381324
1239- assert_eq ! ( chunks. len( ) , 3 ) ;
1325+ let expected_chunks = vec ! [
1326+ ( RangeValue :: new( 3 , 16 ) , " First chunk." ) ,
1327+ ( RangeValue :: new( 19 , 45 ) , " Second chunk with spaces" ) ,
1328+ ( RangeValue :: new( 46 , 57 ) , "at the end." ) ,
1329+ ] ;
12401330
1241- assert_chunk_text_consistency (
1242- text,
1243- & chunks[ 0 ] ,
1244- " First chunk." ,
1245- "Whitespace Test, Chunk 0" ,
1246- ) ;
1247- assert_chunk_text_consistency (
1248- text,
1249- & chunks[ 1 ] ,
1250- " Second chunk with spaces" ,
1251- "Whitespace Test, Chunk 1" ,
1252- ) ;
1253- assert_chunk_text_consistency ( text, & chunks[ 2 ] , "at the end." , "Whitespace Test, Chunk 2" ) ;
1331+ for ( range, expected_text) in expected_chunks {
1332+ let key = KeyValue :: from_single_part ( range) ;
1333+ match table. get ( & key) {
1334+ Some ( scope_value_ref) => {
1335+ let chunk_text =
1336+ scope_value_ref. 0 . fields [ 0 ] . as_str ( ) . unwrap_or_else ( |_| {
1337+ panic ! ( "Chunk text not a string for key {key:?}" )
1338+ } ) ;
1339+ assert_eq ! ( * * chunk_text, * expected_text) ;
1340+ }
1341+ None => panic ! ( "Expected row value for key {key:?}, not found" ) ,
1342+ }
1343+ }
1344+ }
1345+ other => panic ! ( "Expected Value::KTable, got {other:?}" ) ,
1346+ }
1347+ }
12541348 }
12551349}
0 commit comments