Skip to content

Commit a1e2810

Browse files
authored
Updated existing tests for split_recursively.rs to be based on public APIs (#1075)
* Updated existing tests for split_recursively.rs to be based on public APIs.
* Refactored the logic that builds the input argument schema into a function and reused it across tests; also refactored the code a bit.
1 parent 5a550dc commit a1e2810

File tree

1 file changed

+208
-114
lines changed

1 file changed

+208
-114
lines changed

src/ops/functions/split_recursively.rs

Lines changed: 208 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -939,55 +939,13 @@ pub fn register(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
939939
#[cfg(test)]
940940
mod tests {
941941
use super::*;
942-
use crate::ops::{functions::test_utils::test_flow_function, shared::split::OutputPosition};
943-
944-
// Helper function to assert chunk text and its consistency with the range within the original text.
945-
fn assert_chunk_text_consistency(
946-
full_text: &str, // Added full text
947-
actual_chunk: &ChunkOutput<'_>,
948-
expected_text: &str,
949-
context: &str,
950-
) {
951-
// Extract text using the chunk's range from the original full text.
952-
let extracted_text = full_text
953-
.get(actual_chunk.start_pos.byte_offset..actual_chunk.end_pos.byte_offset)
954-
.unwrap();
955-
// Assert that the expected text matches the text provided in the chunk.
956-
assert_eq!(
957-
actual_chunk.text, expected_text,
958-
"Provided chunk text mismatch - {context}"
959-
);
960-
// Assert that the expected text also matches the text extracted using the chunk's range.
961-
assert_eq!(
962-
extracted_text, expected_text,
963-
"Range inconsistency: extracted text mismatch - {context}"
964-
);
965-
}
966-
967-
// Creates a default RecursiveChunker for testing, assuming no language-specific parsing.
968-
fn create_test_chunker<'a>(
969-
text: &'a str,
970-
chunk_size: usize,
971-
min_chunk_size: usize,
972-
chunk_overlap: usize,
973-
) -> RecursiveChunker<'a> {
974-
RecursiveChunker {
975-
full_text: text,
976-
chunk_size,
977-
chunk_overlap,
978-
min_chunk_size,
979-
}
980-
}
981-
982-
#[tokio::test]
983-
async fn test_split_recursively() {
984-
let spec = Spec {
985-
custom_languages: vec![],
986-
};
987-
let factory = Arc::new(Factory);
988-
let text_content = "Linea 1.\nLinea 2.\n\nLinea 3.";
942+
use crate::ops::functions::test_utils::test_flow_function;
943+
use crate::ops::sdk::{BasicValueType, KeyValue, RangeValue, make_output_type};
944+
use crate::ops::shared::split::OutputPosition;
989945

990-
let input_arg_schemas = &[
946+
// Helper function to build the standard input argument schemas for split_recursively tests
947+
fn build_split_recursively_arg_schemas() -> Vec<(Option<&'static str>, EnrichedValueType)> {
948+
vec![
991949
(
992950
Some("text"),
993951
make_output_type(BasicValueType::Str).with_nullable(true),
@@ -1008,7 +966,17 @@ mod tests {
1008966
Some("language"),
1009967
make_output_type(BasicValueType::Str).with_nullable(true),
1010968
),
1011-
];
969+
]
970+
}
971+
972+
#[tokio::test]
973+
async fn test_split_recursively() {
974+
let spec = Spec {
975+
custom_languages: vec![],
976+
};
977+
let factory = Arc::new(Factory);
978+
let text_content = "Linea 1.\nLinea 2.\n\nLinea 3.";
979+
let input_arg_schemas = &build_split_recursively_arg_schemas();
1012980

1013981
{
1014982
let result = test_flow_function(
@@ -1168,88 +1136,214 @@ mod tests {
11681136
);
11691137
}
11701138

1171-
#[test]
1172-
fn test_basic_split_no_overlap() {
1139+
#[tokio::test]
1140+
async fn test_basic_split_no_overlap() {
1141+
let spec = Spec {
1142+
custom_languages: vec![],
1143+
};
1144+
let factory = Arc::new(Factory);
11731145
let text = "Linea 1.\nLinea 2.\n\nLinea 3.";
1174-
let chunker = create_test_chunker(text, 15, 5, 0);
1175-
1176-
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
1177-
lang_config: &DEFAULT_LANGUAGE_CONFIG,
1178-
next_regexp_sep_id: 0,
1179-
});
1146+
let input_arg_schemas = &build_split_recursively_arg_schemas();
11801147

1181-
assert!(result.is_ok());
1182-
let chunks = result.unwrap();
1148+
{
1149+
let result = test_flow_function(
1150+
&factory,
1151+
&spec,
1152+
input_arg_schemas,
1153+
vec![
1154+
text.to_string().into(),
1155+
(15i64).into(),
1156+
(5i64).into(),
1157+
(0i64).into(),
1158+
Value::Null,
1159+
],
1160+
)
1161+
.await;
1162+
assert!(
1163+
result.is_ok(),
1164+
"test_flow_function failed: {:?}",
1165+
result.err()
1166+
);
1167+
let value = result.unwrap();
1168+
match value {
1169+
Value::KTable(table) => {
1170+
let expected_chunks = vec![
1171+
(RangeValue::new(0, 8), "Linea 1."),
1172+
(RangeValue::new(9, 17), "Linea 2."),
1173+
(RangeValue::new(19, 27), "Linea 3."),
1174+
];
11831175

1184-
assert_eq!(chunks.len(), 3);
1185-
assert_chunk_text_consistency(text, &chunks[0], "Linea 1.", "Test 1, Chunk 0");
1186-
assert_chunk_text_consistency(text, &chunks[1], "Linea 2.", "Test 1, Chunk 1");
1187-
assert_chunk_text_consistency(text, &chunks[2], "Linea 3.", "Test 1, Chunk 2");
1176+
for (range, expected_text) in expected_chunks {
1177+
let key = KeyValue::from_single_part(range);
1178+
match table.get(&key) {
1179+
Some(scope_value_ref) => {
1180+
let chunk_text =
1181+
scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
1182+
panic!("Chunk text not a string for key {key:?}")
1183+
});
1184+
assert_eq!(**chunk_text, *expected_text);
1185+
}
1186+
None => panic!("Expected row value for key {key:?}, not found"),
1187+
}
1188+
}
1189+
}
1190+
other => panic!("Expected Value::KTable, got {other:?}"),
1191+
}
1192+
}
11881193

11891194
// Test splitting when chunk_size forces breaks within segments.
11901195
let text2 = "A very very long text that needs to be split.";
1191-
let chunker2 = create_test_chunker(text2, 20, 12, 0);
1192-
let result2 = chunker2.split_root_chunk(ChunkKind::RegexpSepChunk {
1193-
lang_config: &DEFAULT_LANGUAGE_CONFIG,
1194-
next_regexp_sep_id: 0,
1195-
});
1196-
1197-
assert!(result2.is_ok());
1198-
let chunks2 = result2.unwrap();
1199-
1200-
// Expect multiple chunks, likely split by spaces due to chunk_size.
1201-
assert!(chunks2.len() > 1);
1202-
assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
1203-
assert!(chunks2[0].text.len() <= 20);
1196+
{
1197+
let result = test_flow_function(
1198+
&factory,
1199+
&spec,
1200+
input_arg_schemas,
1201+
vec![
1202+
text2.to_string().into(),
1203+
(20i64).into(),
1204+
(12i64).into(),
1205+
(0i64).into(),
1206+
Value::Null,
1207+
],
1208+
)
1209+
.await;
1210+
assert!(
1211+
result.is_ok(),
1212+
"test_flow_function failed: {:?}",
1213+
result.err()
1214+
);
1215+
let value = result.unwrap();
1216+
match value {
1217+
Value::KTable(table) => {
1218+
// Expect multiple chunks, likely split by spaces due to chunk_size.
1219+
assert!(table.len() > 1);
1220+
1221+
let key = KeyValue::from_single_part(RangeValue::new(0, 16));
1222+
match table.get(&key) {
1223+
Some(scope_value_ref) => {
1224+
let chunk_text =
1225+
scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
1226+
panic!("Chunk text not a string for key {key:?}")
1227+
});
1228+
assert_eq!(&**chunk_text, "A very very long");
1229+
assert!(chunk_text.len() <= 20);
1230+
}
1231+
None => panic!("Expected row value for key {key:?}, not found"),
1232+
}
1233+
}
1234+
other => panic!("Expected Value::KTable, got {other:?}"),
1235+
}
1236+
}
12041237
}
12051238

1206-
#[test]
1207-
fn test_basic_split_with_overlap() {
1239+
#[tokio::test]
1240+
async fn test_basic_split_with_overlap() {
1241+
let spec = Spec {
1242+
custom_languages: vec![],
1243+
};
1244+
let factory = Arc::new(Factory);
12081245
let text = "This is a test text that is a bit longer to see how the overlap works.";
1209-
let chunker = create_test_chunker(text, 20, 10, 5);
1210-
1211-
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
1212-
lang_config: &DEFAULT_LANGUAGE_CONFIG,
1213-
next_regexp_sep_id: 0,
1214-
});
1215-
1216-
assert!(result.is_ok());
1217-
let chunks = result.unwrap();
1246+
let input_arg_schemas = &build_split_recursively_arg_schemas();
12181247

1219-
assert!(chunks.len() > 1);
1248+
{
1249+
let result = test_flow_function(
1250+
&factory,
1251+
&spec,
1252+
input_arg_schemas,
1253+
vec![
1254+
text.to_string().into(),
1255+
(20i64).into(),
1256+
(10i64).into(),
1257+
(5i64).into(),
1258+
Value::Null,
1259+
],
1260+
)
1261+
.await;
1262+
assert!(
1263+
result.is_ok(),
1264+
"test_flow_function failed: {:?}",
1265+
result.err()
1266+
);
1267+
let value = result.unwrap();
1268+
match value {
1269+
Value::KTable(table) => {
1270+
assert!(table.len() > 1);
12201271

1221-
if chunks.len() >= 2 {
1222-
assert!(chunks[0].text.len() <= 25);
1272+
// Check first chunk length
1273+
if table.len() >= 2 {
1274+
let first_key = table.keys().next().unwrap();
1275+
match table.get(first_key) {
1276+
Some(scope_value_ref) => {
1277+
let chunk_text =
1278+
scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
1279+
panic!("Chunk text not a string for key {first_key:?}")
1280+
});
1281+
assert!(chunk_text.len() <= 25);
1282+
}
1283+
None => panic!("Expected row value for first key, not found"),
1284+
}
1285+
}
1286+
}
1287+
other => panic!("Expected Value::KTable, got {other:?}"),
1288+
}
12231289
}
12241290
}
12251291

1226-
#[test]
1227-
fn test_split_trims_whitespace() {
1292+
#[tokio::test]
1293+
async fn test_split_trims_whitespace() {
1294+
let spec = Spec {
1295+
custom_languages: vec![],
1296+
};
1297+
let factory = Arc::new(Factory);
12281298
let text = " \n First chunk. \n\n Second chunk with spaces at the end. \n";
1229-
let chunker = create_test_chunker(text, 30, 10, 0);
1230-
1231-
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
1232-
lang_config: &DEFAULT_LANGUAGE_CONFIG,
1233-
next_regexp_sep_id: 0,
1234-
});
1299+
let input_arg_schemas = &build_split_recursively_arg_schemas();
12351300

1236-
assert!(result.is_ok());
1237-
let chunks = result.unwrap();
1301+
{
1302+
let result = test_flow_function(
1303+
&factory,
1304+
&spec,
1305+
input_arg_schemas,
1306+
vec![
1307+
text.to_string().into(),
1308+
(30i64).into(),
1309+
(10i64).into(),
1310+
(0i64).into(),
1311+
Value::Null,
1312+
],
1313+
)
1314+
.await;
1315+
assert!(
1316+
result.is_ok(),
1317+
"test_flow_function failed: {:?}",
1318+
result.err()
1319+
);
1320+
let value = result.unwrap();
1321+
match value {
1322+
Value::KTable(table) => {
1323+
assert_eq!(table.len(), 3);
12381324

1239-
assert_eq!(chunks.len(), 3);
1325+
let expected_chunks = vec![
1326+
(RangeValue::new(3, 16), " First chunk."),
1327+
(RangeValue::new(19, 45), " Second chunk with spaces"),
1328+
(RangeValue::new(46, 57), "at the end."),
1329+
];
12401330

1241-
assert_chunk_text_consistency(
1242-
text,
1243-
&chunks[0],
1244-
" First chunk.",
1245-
"Whitespace Test, Chunk 0",
1246-
);
1247-
assert_chunk_text_consistency(
1248-
text,
1249-
&chunks[1],
1250-
" Second chunk with spaces",
1251-
"Whitespace Test, Chunk 1",
1252-
);
1253-
assert_chunk_text_consistency(text, &chunks[2], "at the end.", "Whitespace Test, Chunk 2");
1331+
for (range, expected_text) in expected_chunks {
1332+
let key = KeyValue::from_single_part(range);
1333+
match table.get(&key) {
1334+
Some(scope_value_ref) => {
1335+
let chunk_text =
1336+
scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
1337+
panic!("Chunk text not a string for key {key:?}")
1338+
});
1339+
assert_eq!(**chunk_text, *expected_text);
1340+
}
1341+
None => panic!("Expected row value for key {key:?}, not found"),
1342+
}
1343+
}
1344+
}
1345+
other => panic!("Expected Value::KTable, got {other:?}"),
1346+
}
1347+
}
12541348
}
12551349
}

0 commit comments

Comments
 (0)