@@ -464,6 +464,42 @@ impl<'s> AtomChunksCollector<'s> {
     }
 }
 
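+/// Character-based position (char offset, plus 1-based line and column) surfaced in the output.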
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct OutputPosition {
+    char_offset: usize,
+    line: u32,
+    column: u32,
+}
+
+impl OutputPosition {
+    fn into_output(self) -> value::Value {
+        value::Value::Struct(fields_value!(
+            self.char_offset as i64,
+            self.line as i64,
+            self.column as i64
+        ))
+    }
+}
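+/// Byte offset of a chunk boundary; `output` is filled in later by `set_output_positions`.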
+struct Position {
+    byte_offset: usize,
+    output: Option<OutputPosition>,
+}
+
+impl Position {
+    fn new(byte_offset: usize) -> Self {
+        Self {
+            byte_offset,
+            output: None,
+        }
+    }
+}
+
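+/// A merged chunk: start/end positions plus the chunk's text slice.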
+struct ChunkOutput<'s> {
+    start_pos: Position,
+    end_pos: Position,
+    text: &'s str,
+}
+
 struct RecursiveChunker<'s> {
     full_text: &'s str,
     chunk_size: usize,
@@ -551,7 +587,7 @@ impl<'t, 's: 't> RecursiveChunker<'s> {
         }
     }
 
-    fn merge_atom_chunks(&self, atom_chunks: Vec<AtomChunk>) -> Vec<(RangeValue, &'s str)> {
+    fn merge_atom_chunks(&self, atom_chunks: Vec<AtomChunk>) -> Vec<ChunkOutput<'s>> {
         struct AtomRoutingPlan {
             start_idx: usize, // index of `atom_chunks` for the start chunk
             prev_plan_idx: usize, // index of `plans` for the previous plan
@@ -687,15 +723,18 @@ impl<'t, 's: 't> RecursiveChunker<'s> {
             let plan = &plans[plan_idx];
             let start_chunk = &atom_chunks[plan.start_idx];
             let end_chunk = &atom_chunks[plan_idx - 1];
-            let range = RangeValue::new(start_chunk.range.start, end_chunk.range.end);
-            output.push((range, &self.full_text[range.start..range.end]));
+            output.push(ChunkOutput {
+                start_pos: Position::new(start_chunk.range.start),
+                end_pos: Position::new(end_chunk.range.end),
+                text: &self.full_text[start_chunk.range.start..end_chunk.range.end],
+            });
             plan_idx = plan.prev_plan_idx;
         }
         output.reverse();
         output
     }
 
-    fn split_root_chunk(&self, kind: ChunkKind<'t>) -> Result<Vec<(RangeValue, &'s str)>> {
+    fn split_root_chunk(&self, kind: ChunkKind<'t>) -> Result<Vec<ChunkOutput<'s>>> {
         let mut atom_collector = AtomChunksCollector {
             full_text: self.full_text,
             min_level: 0,
@@ -769,34 +808,52 @@ impl Executor {
     }
 }
 
-fn translate_bytes_to_chars<'a>(text: &str, offsets: impl Iterator<Item = &'a mut usize>) {
-    let mut offsets = offsets.collect::<Vec<_>>();
-    offsets.sort_by_key(|o| **o);
+fn set_output_positions<'a>(text: &str, positions: impl Iterator<Item = &'a mut Position>) {
+    let mut positions = positions.collect::<Vec<_>>();
+    positions.sort_by_key(|o| o.byte_offset);
 
-    let mut offsets_iter = offsets.iter_mut();
-    let mut next_offset = if let Some(offset) = offsets_iter.next() {
-        offset
-    } else {
+    let mut positions_iter = positions.iter_mut();
+    let Some(mut next_position) = positions_iter.next() else {
         return;
     };
 
-    let mut char_idx = 0;
-    for (bytes_idx, _) in text.char_indices() {
-        while **next_offset == bytes_idx {
-            **next_offset = char_idx;
-            next_offset = if let Some(offset) = offsets_iter.next() {
-                offset
+    let mut char_offset = 0;
+    let mut line = 1;
+    let mut column = 1;
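+    // Positions are sorted by byte offset, so a single pass over the chars resolves them all.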
+    for (byte_offset, ch) in text.char_indices() {
+        while next_position.byte_offset == byte_offset {
+            next_position.output = Some(OutputPosition {
+                char_offset,
+                line,
+                column,
+            });
+            if let Some(position) = positions_iter.next() {
+                next_position = position;
             } else {
                 return;
             }
         }
-        char_idx += 1;
+        char_offset += 1;
+        if ch == '\n' {
+            line += 1;
+            column = 1;
+        } else {
+            column += 1;
+        }
     }
 
     // Offsets after the last char.
-    **next_offset = char_idx;
-    for offset in offsets_iter {
-        **offset = char_idx;
+    loop {
+        next_position.output = Some(OutputPosition {
+            char_offset,
+            line,
+            column,
+        });
+        if let Some(position) = positions_iter.next() {
+            next_position = position;
+        } else {
+            return;
+        }
     }
 }
 
@@ -850,16 +907,31 @@ impl SimpleFunctionExecutor for Executor {
             })?
         };
 
-        translate_bytes_to_chars(
+        set_output_positions(
             full_text,
-            output.iter_mut().flat_map(|(range, _)| {
-                std::iter::once(&mut range.start).chain(std::iter::once(&mut range.end))
+            output.iter_mut().flat_map(|chunk_output| {
+                std::iter::once(&mut chunk_output.start_pos)
+                    .chain(std::iter::once(&mut chunk_output.end_pos))
             }),
         );
 
         let table = output
             .into_iter()
-            .map(|(range, text)| (range.into(), fields_value!(Arc::<str>::from(text)).into()))
+            .map(|chunk_output| {
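+                // Row key: the chunk's byte range; row value: chunk text plus start/end positions.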
+                (
+                    RangeValue::new(
+                        chunk_output.start_pos.byte_offset,
+                        chunk_output.end_pos.byte_offset,
+                    )
+                    .into(),
+                    fields_value!(
+                        Arc::<str>::from(chunk_output.text),
+                        chunk_output.start_pos.output.unwrap().into_output(),
+                        chunk_output.end_pos.output.unwrap().into_output()
+                    )
+                    .into(),
+                )
+            })
             .collect();
 
         Ok(Value::KTable(table))
@@ -901,6 +973,15 @@ impl SimpleFunctionFactoryBase for Factory {
                 .expect_type(&ValueType::Basic(BasicValueType::Str))?,
         };
 
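+        // Schema of the `start`/`end` position struct added to each chunk row below.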
+        let pos_struct = schema::ValueType::Struct(schema::StructSchema {
+            fields: Arc::new(vec![
+                schema::FieldSchema::new("offset", make_output_type(BasicValueType::Int64)),
+                schema::FieldSchema::new("line", make_output_type(BasicValueType::Int64)),
+                schema::FieldSchema::new("column", make_output_type(BasicValueType::Int64)),
+            ]),
+            description: None,
+        });
+
         let mut struct_schema = StructSchema::default();
         let mut schema_builder = StructSchemaBuilder::new(&mut struct_schema);
         schema_builder.add_field(FieldSchema::new(
@@ -911,6 +992,22 @@ impl SimpleFunctionFactoryBase for Factory {
             "text",
             make_output_type(BasicValueType::Str),
         ));
+        schema_builder.add_field(FieldSchema::new(
+            "start",
+            schema::EnrichedValueType {
+                typ: pos_struct.clone(),
+                nullable: false,
+                attrs: Default::default(),
+            },
+        ));
+        schema_builder.add_field(FieldSchema::new(
+            "end",
+            schema::EnrichedValueType {
+                typ: pos_struct,
+                nullable: false,
+                attrs: Default::default(),
+            },
+        ));
         let output_schema = make_output_type(TableSchema::new(TableKind::KTable, struct_schema))
             .with_attr(
                 field_attrs::CHUNK_BASE_TEXT,
@@ -940,15 +1037,17 @@ mod tests {
     // Helper function to assert chunk text and its consistency with the range within the original text.
     fn assert_chunk_text_consistency(
         full_text: &str, // Added full text
-        actual_chunk: &(RangeValue, &str),
+        actual_chunk: &ChunkOutput<'_>,
         expected_text: &str,
         context: &str,
     ) {
         // Extract text using the chunk's range from the original full text.
-        let extracted_text = actual_chunk.0.extract_str(full_text);
+        let extracted_text = full_text
+            .get(actual_chunk.start_pos.byte_offset..actual_chunk.end_pos.byte_offset)
+            .unwrap();
         // Assert that the expected text matches the text provided in the chunk.
         assert_eq!(
-            actual_chunk.1, expected_text,
+            actual_chunk.text, expected_text,
             "Provided chunk text mismatch - {}",
             context
         );
@@ -978,13 +1077,13 @@ mod tests {
     #[test]
     fn test_translate_bytes_to_chars_simple() {
         let text = "abc😄def";
-        let mut start1 = 0;
-        let mut end1 = 3;
-        let mut start2 = 3;
-        let mut end2 = 7;
-        let mut start3 = 7;
-        let mut end3 = 10;
-        let mut end_full = text.len();
+        let mut start1 = Position::new(0);
+        let mut end1 = Position::new(3);
+        let mut start2 = Position::new(3);
+        let mut end2 = Position::new(7);
+        let mut start3 = Position::new(7);
+        let mut end3 = Position::new(10);
+        let mut end_full = Position::new(text.len());
 
         let offsets = vec![
             &mut start1,
@@ -996,15 +1095,56 @@ mod tests {
             &mut end_full,
         ];
 
-        translate_bytes_to_chars(text, offsets.into_iter());
+        set_output_positions(text, offsets.into_iter());
 
-        assert_eq!(start1, 0);
-        assert_eq!(end1, 3);
-        assert_eq!(start2, 3);
-        assert_eq!(end2, 4);
-        assert_eq!(start3, 4);
-        assert_eq!(end3, 7);
-        assert_eq!(end_full, 7);
+        assert_eq!(
+            start1.output,
+            Some(OutputPosition {
+                char_offset: 0,
+                line: 1,
+                column: 1,
+            })
+        );
+        assert_eq!(
+            end1.output,
+            Some(OutputPosition {
+                char_offset: 3,
+                line: 1,
+                column: 4,
+            })
+        );
+        assert_eq!(
+            start2.output,
+            Some(OutputPosition {
+                char_offset: 3,
+                line: 1,
+                column: 4,
+            })
+        );
+        assert_eq!(
+            end2.output,
+            Some(OutputPosition {
+                char_offset: 4,
+                line: 1,
+                column: 5,
+            })
+        );
+        assert_eq!(
+            end3.output,
+            Some(OutputPosition {
+                char_offset: 7,
+                line: 1,
+                column: 8,
+            })
+        );
+        assert_eq!(
+            end_full.output,
+            Some(OutputPosition {
+                char_offset: 7,
+                line: 1,
+                column: 8,
+            })
+        );
     }
 
     #[test]
@@ -1039,7 +1179,7 @@ mod tests {
         // Expect multiple chunks, likely split by spaces due to chunk_size.
         assert!(chunks2.len() > 1);
         assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
-        assert!(chunks2[0].1.len() <= 20);
+        assert!(chunks2[0].text.len() <= 20);
     }
     #[test]
     fn test_basic_split_with_overlap() {
@@ -1057,10 +1197,7 @@ mod tests {
         assert!(chunks.len() > 1);
 
         if chunks.len() >= 2 {
-            let _chunk1_text = chunks[0].1;
-            let _chunk2_text = chunks[1].1;
-
-            assert!(chunks[0].1.len() <= 25);
+            assert!(chunks[0].text.len() <= 25);
         }
     }
     #[test]