Skip to content

Commit 670b606

Browse files
committed
feat(split): add line/column to output of SplitRecursively
1 parent 9a4c899 commit 670b606

File tree

1 file changed

+185
-48
lines changed

1 file changed

+185
-48
lines changed

src/ops/functions/split_recursively.rs

Lines changed: 185 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,42 @@ impl<'s> AtomChunksCollector<'s> {
464464
}
465465
}
466466

467+
/// Character-based location of a chunk boundary, as computed by
/// `set_output_positions` while it walks the text char by char.
#[derive(Debug, Clone, PartialEq, Eq)]
468+
struct OutputPosition {
469+
/// Number of chars that precede this position in the text (0 for the very start).
char_offset: usize,
470+
/// Line number; `set_output_positions` starts counting at 1 and bumps it after each '\n'.
line: u32,
471+
/// Column within the line, counted in chars; starts at 1 and resets to 1 after each '\n'.
column: u32,
472+
}
473+
474+
impl OutputPosition {
475+
/// Consumes the position and packs it into a `value::Value::Struct`.
///
/// The fields are emitted in the order `char_offset`, `line`, `column`, each cast
/// to `i64`. That order lines up with the `offset`, `line`, `column` Int64 fields of
/// the position struct schema declared for the `start` / `end` output fields.
fn into_output(self) -> value::Value {
476+
value::Value::Struct(fields_value!(
477+
self.char_offset as i64,
478+
self.line as i64,
479+
self.column as i64
480+
))
481+
}
482+
}
483+
/// A chunk boundary: the byte offset it was created with, plus the char/line/column
/// form that is filled in afterwards.
struct Position {
484+
/// Byte offset into the text; `set_output_positions` sorts and matches on this value.
byte_offset: usize,
485+
/// `None` when created by `Position::new`; set to `Some(..)` by `set_output_positions`.
output: Option<OutputPosition>,
486+
}
487+
488+
impl Position {
489+
/// Creates a position that only knows its byte offset; `output` starts as `None`
/// and stays that way until `set_output_positions` fills it in.
fn new(byte_offset: usize) -> Self {
490+
Self {
491+
byte_offset,
492+
output: None,
493+
}
494+
}
495+
}
496+
497+
/// One chunk produced by the chunker: its two boundaries and the text between them.
struct ChunkOutput<'s> {
498+
/// Boundary at the first byte of the chunk.
start_pos: Position,
499+
/// Boundary just past the last byte of the chunk (used as the exclusive end of the slice).
end_pos: Position,
500+
/// The chunk's text, borrowed from the full text as `full_text[start..end]`.
text: &'s str,
501+
}
502+
467503
struct RecursiveChunker<'s> {
468504
full_text: &'s str,
469505
chunk_size: usize,
@@ -551,7 +587,7 @@ impl<'t, 's: 't> RecursiveChunker<'s> {
551587
}
552588
}
553589

554-
fn merge_atom_chunks(&self, atom_chunks: Vec<AtomChunk>) -> Vec<(RangeValue, &'s str)> {
590+
fn merge_atom_chunks(&self, atom_chunks: Vec<AtomChunk>) -> Vec<ChunkOutput<'s>> {
555591
struct AtomRoutingPlan {
556592
start_idx: usize, // index of `atom_chunks` for the start chunk
557593
prev_plan_idx: usize, // index of `plans` for the previous plan
@@ -687,15 +723,18 @@ impl<'t, 's: 't> RecursiveChunker<'s> {
687723
let plan = &plans[plan_idx];
688724
let start_chunk = &atom_chunks[plan.start_idx];
689725
let end_chunk = &atom_chunks[plan_idx - 1];
690-
let range = RangeValue::new(start_chunk.range.start, end_chunk.range.end);
691-
output.push((range, &self.full_text[range.start..range.end]));
726+
output.push(ChunkOutput {
727+
start_pos: Position::new(start_chunk.range.start),
728+
end_pos: Position::new(end_chunk.range.end),
729+
text: &self.full_text[start_chunk.range.start..end_chunk.range.end],
730+
});
692731
plan_idx = plan.prev_plan_idx;
693732
}
694733
output.reverse();
695734
output
696735
}
697736

698-
fn split_root_chunk(&self, kind: ChunkKind<'t>) -> Result<Vec<(RangeValue, &'s str)>> {
737+
fn split_root_chunk(&self, kind: ChunkKind<'t>) -> Result<Vec<ChunkOutput<'s>>> {
699738
let mut atom_collector = AtomChunksCollector {
700739
full_text: self.full_text,
701740
min_level: 0,
@@ -769,34 +808,52 @@ impl Executor {
769808
}
770809
}
771810

772-
fn translate_bytes_to_chars<'a>(text: &str, offsets: impl Iterator<Item = &'a mut usize>) {
773-
let mut offsets = offsets.collect::<Vec<_>>();
774-
offsets.sort_by_key(|o| **o);
811+
/// Fills in `output` (char offset, line, column) for every given `Position`,
/// based on its `byte_offset` into `text`.
///
/// The positions are sorted by byte offset and resolved in a single forward pass
/// over `text.char_indices()`. `char_offset` counts chars from 0; `line` and
/// `column` both start at 1, and a '\n' advances the line and resets the column.
///
/// Positions that are still pending once the text is exhausted all receive the
/// end-of-text values. This covers an offset equal to `text.len()`, and also any
/// offset that never equals the start index of a char, together with every
/// position sorted after it.
fn set_output_positions<'a>(text: &str, positions: impl Iterator<Item = &'a mut Position>) {
812+
let mut positions = positions.collect::<Vec<_>>();
813+
// Sort by byte offset so one forward pass over the text can serve every position.
positions.sort_by_key(|o| o.byte_offset);
775814

776-
let mut offsets_iter = offsets.iter_mut();
777-
let mut next_offset = if let Some(offset) = offsets_iter.next() {
778-
offset
779-
} else {
815+
let mut positions_iter = positions.iter_mut();
816+
// Nothing to fill in when no positions were supplied.
let Some(mut next_position) = positions_iter.next() else {
780817
return;
781818
};
782819

783-
let mut char_idx = 0;
784-
for (bytes_idx, _) in text.char_indices() {
785-
while **next_offset == bytes_idx {
786-
**next_offset = char_idx;
787-
next_offset = if let Some(offset) = offsets_iter.next() {
788-
offset
820+
// `char_offset` counts the chars consumed so far; `line` and `column` are 1-based.
let mut char_offset = 0;
821+
let mut line = 1;
822+
let mut column = 1;
823+
for (byte_offset, ch) in text.char_indices() {
824+
// `while`, not `if`: several positions may share the same byte offset
// (e.g. the end of one chunk and the start of the next).
while next_position.byte_offset == byte_offset {
825+
next_position.output = Some(OutputPosition {
826+
char_offset,
827+
line,
828+
column,
829+
});
830+
if let Some(position) = positions_iter.next() {
831+
next_position = position;
789832
} else {
790833
return;
791834
}
792835
}
793-
char_idx += 1;
836+
char_offset += 1;
837+
// A newline moves to the next line and resets the column; any other char
// just advances the column.
if ch == '\n' {
838+
line += 1;
839+
column = 1;
840+
} else {
841+
column += 1;
842+
}
794843
}
795844

796845
// Offsets after the last char.
797-
**next_offset = char_idx;
798-
for offset in offsets_iter {
799-
**offset = char_idx;
846+
// Every position still pending gets the end-of-text values.
loop {
847+
next_position.output = Some(OutputPosition {
848+
char_offset,
849+
line,
850+
column,
851+
});
852+
if let Some(position) = positions_iter.next() {
853+
next_position = position;
854+
} else {
855+
return;
856+
}
800857
}
801858
}
802859

@@ -850,16 +907,31 @@ impl SimpleFunctionExecutor for Executor {
850907
})?
851908
};
852909

853-
translate_bytes_to_chars(
910+
set_output_positions(
854911
full_text,
855-
output.iter_mut().flat_map(|(range, _)| {
856-
std::iter::once(&mut range.start).chain(std::iter::once(&mut range.end))
912+
output.iter_mut().flat_map(|chunk_output| {
913+
std::iter::once(&mut chunk_output.start_pos)
914+
.chain(std::iter::once(&mut chunk_output.end_pos))
857915
}),
858916
);
859917

860918
let table = output
861919
.into_iter()
862-
.map(|(range, text)| (range.into(), fields_value!(Arc::<str>::from(text)).into()))
920+
.map(|chunk_output| {
921+
(
922+
RangeValue::new(
923+
chunk_output.start_pos.byte_offset,
924+
chunk_output.end_pos.byte_offset,
925+
)
926+
.into(),
927+
fields_value!(
928+
Arc::<str>::from(chunk_output.text),
929+
chunk_output.start_pos.output.unwrap().into_output(),
930+
chunk_output.end_pos.output.unwrap().into_output()
931+
)
932+
.into(),
933+
)
934+
})
863935
.collect();
864936

865937
Ok(Value::KTable(table))
@@ -901,6 +973,15 @@ impl SimpleFunctionFactoryBase for Factory {
901973
.expect_type(&ValueType::Basic(BasicValueType::Str))?,
902974
};
903975

976+
let pos_struct = schema::ValueType::Struct(schema::StructSchema {
977+
fields: Arc::new(vec![
978+
schema::FieldSchema::new("offset", make_output_type(BasicValueType::Int64)),
979+
schema::FieldSchema::new("line", make_output_type(BasicValueType::Int64)),
980+
schema::FieldSchema::new("column", make_output_type(BasicValueType::Int64)),
981+
]),
982+
description: None,
983+
});
984+
904985
let mut struct_schema = StructSchema::default();
905986
let mut schema_builder = StructSchemaBuilder::new(&mut struct_schema);
906987
schema_builder.add_field(FieldSchema::new(
@@ -911,6 +992,22 @@ impl SimpleFunctionFactoryBase for Factory {
911992
"text",
912993
make_output_type(BasicValueType::Str),
913994
));
995+
schema_builder.add_field(FieldSchema::new(
996+
"start",
997+
schema::EnrichedValueType {
998+
typ: pos_struct.clone(),
999+
nullable: false,
1000+
attrs: Default::default(),
1001+
},
1002+
));
1003+
schema_builder.add_field(FieldSchema::new(
1004+
"end",
1005+
schema::EnrichedValueType {
1006+
typ: pos_struct,
1007+
nullable: false,
1008+
attrs: Default::default(),
1009+
},
1010+
));
9141011
let output_schema = make_output_type(TableSchema::new(TableKind::KTable, struct_schema))
9151012
.with_attr(
9161013
field_attrs::CHUNK_BASE_TEXT,
@@ -940,15 +1037,17 @@ mod tests {
9401037
// Helper function to assert chunk text and its consistency with the range within the original text.
9411038
fn assert_chunk_text_consistency(
9421039
full_text: &str, // Added full text
943-
actual_chunk: &(RangeValue, &str),
1040+
actual_chunk: &ChunkOutput<'_>,
9441041
expected_text: &str,
9451042
context: &str,
9461043
) {
9471044
// Extract text using the chunk's range from the original full text.
948-
let extracted_text = actual_chunk.0.extract_str(full_text);
1045+
let extracted_text = full_text
1046+
.get(actual_chunk.start_pos.byte_offset..actual_chunk.end_pos.byte_offset)
1047+
.unwrap();
9491048
// Assert that the expected text matches the text provided in the chunk.
9501049
assert_eq!(
951-
actual_chunk.1, expected_text,
1050+
actual_chunk.text, expected_text,
9521051
"Provided chunk text mismatch - {}",
9531052
context
9541053
);
@@ -978,13 +1077,13 @@ mod tests {
9781077
#[test]
9791078
fn test_translate_bytes_to_chars_simple() {
9801079
let text = "abc😄def";
981-
let mut start1 = 0;
982-
let mut end1 = 3;
983-
let mut start2 = 3;
984-
let mut end2 = 7;
985-
let mut start3 = 7;
986-
let mut end3 = 10;
987-
let mut end_full = text.len();
1080+
let mut start1 = Position::new(0);
1081+
let mut end1 = Position::new(3);
1082+
let mut start2 = Position::new(3);
1083+
let mut end2 = Position::new(7);
1084+
let mut start3 = Position::new(7);
1085+
let mut end3 = Position::new(10);
1086+
let mut end_full = Position::new(text.len());
9881087

9891088
let offsets = vec![
9901089
&mut start1,
@@ -996,15 +1095,56 @@ mod tests {
9961095
&mut end_full,
9971096
];
9981097

999-
translate_bytes_to_chars(text, offsets.into_iter());
1098+
set_output_positions(text, offsets.into_iter());
10001099

1001-
assert_eq!(start1, 0);
1002-
assert_eq!(end1, 3);
1003-
assert_eq!(start2, 3);
1004-
assert_eq!(end2, 4);
1005-
assert_eq!(start3, 4);
1006-
assert_eq!(end3, 7);
1007-
assert_eq!(end_full, 7);
1100+
assert_eq!(
1101+
start1.output,
1102+
Some(OutputPosition {
1103+
char_offset: 0,
1104+
line: 1,
1105+
column: 1,
1106+
})
1107+
);
1108+
assert_eq!(
1109+
end1.output,
1110+
Some(OutputPosition {
1111+
char_offset: 3,
1112+
line: 1,
1113+
column: 4,
1114+
})
1115+
);
1116+
assert_eq!(
1117+
start2.output,
1118+
Some(OutputPosition {
1119+
char_offset: 3,
1120+
line: 1,
1121+
column: 4,
1122+
})
1123+
);
1124+
assert_eq!(
1125+
end2.output,
1126+
Some(OutputPosition {
1127+
char_offset: 4,
1128+
line: 1,
1129+
column: 5,
1130+
})
1131+
);
1132+
assert_eq!(
1133+
end3.output,
1134+
Some(OutputPosition {
1135+
char_offset: 7,
1136+
line: 1,
1137+
column: 8,
1138+
})
1139+
);
1140+
assert_eq!(
1141+
end_full.output,
1142+
Some(OutputPosition {
1143+
char_offset: 7,
1144+
line: 1,
1145+
column: 8,
1146+
})
1147+
);
10081148
}
10091149

10101150
#[test]
@@ -1039,7 +1179,7 @@ mod tests {
10391179
// Expect multiple chunks, likely split by spaces due to chunk_size.
10401180
assert!(chunks2.len() > 1);
10411181
assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
1042-
assert!(chunks2[0].1.len() <= 20);
1182+
assert!(chunks2[0].text.len() <= 20);
10431183
}
10441184
#[test]
10451185
fn test_basic_split_with_overlap() {
@@ -1057,10 +1197,7 @@ mod tests {
10571197
assert!(chunks.len() > 1);
10581198

10591199
if chunks.len() >= 2 {
1060-
let _chunk1_text = chunks[0].1;
1061-
let _chunk2_text = chunks[1].1;
1062-
1063-
assert!(chunks[0].1.len() <= 25);
1200+
assert!(chunks[0].text.len() <= 25);
10641201
}
10651202
}
10661203
#[test]

0 commit comments

Comments
 (0)