11use std:: collections:: HashMap ;
22
3+ /// A struct which contains information about line numbers of a source file,
4+ /// and can convert between byte offsets that are used in the compiler and
5+ /// line-column pairs used in LSP.
36#[ derive( Debug , Clone , serde:: Serialize , serde:: Deserialize , PartialEq , Eq ) ]
47pub struct LineNumbers {
8+ /// The byte offsets of the start of each line of the source file
59 pub line_starts : Vec < u32 > ,
10+ /// The total length of the source file
611 pub length : u32 ,
12+ /// A mapping of byte offsets to character length information. This is used
13+ /// when converting between byte indices and line-column numbers, because
14+ /// LSP uses UTF-16, while Rust encodes strings as UTF-8.
15+ ///
16+ /// This only contains characters which are more than one byte in UTF-8,
17+ /// because one byte UTF-8 characters are one UTF-16 segment also, so no
18+ /// translation is needed.
19+ ///
20+ /// We could store the whole source file here instead, however that would
21+ /// be quite wasteful. Most Gleam programs use only ASCII characters, meaning
22+ /// UTF-8 offsets are the same as UTF-16 ones. With this representation, we
23+ /// only need to store a few characters.
24+ ///
25+ /// In most programs this will be empty because they will only be using
26+ /// ASCII characters.
727 pub mapping : HashMap < usize , Character > ,
828}
929
30+ /// Information about how a character is encoded in UTF-8 and UTF-16.
1031#[ derive( Debug , Clone , Copy , serde:: Serialize , serde:: Deserialize , PartialEq , Eq ) ]
1132pub struct Character {
33+ /// The number of bytes needed to encode this in UTF-8.
1234 pub length_utf8 : u8 ,
35+ /// The number of 16-bit segments needed to encode this in UTF-16.
1336 pub length_utf16 : u8 ,
1437}
1538
@@ -43,14 +66,16 @@ impl LineNumbers {
4366 map
4467 }
4568
46- /// Get the line number for a byte index
69+ /// Returns the 1-indexed line number of a given byte index
4770 pub fn line_number ( & self , byte_index : u32 ) -> u32 {
4871 self . line_starts
4972 . binary_search ( & byte_index)
5073 . unwrap_or_else ( |next_line| next_line - 1 ) as u32
5174 + 1
5275 }
5376
77+ /// Returns the 1-indexed line and column number of a given byte index,
78+ /// using a UTF-16 character offset.
5479 pub fn line_and_column_number ( & self , byte_index : u32 ) -> LineColumn {
5580 let line = self . line_number ( byte_index) ;
5681 let line_start = self
@@ -82,9 +107,10 @@ impl LineNumbers {
82107 }
83108 }
84109
85- /// 0 indexed line and character to byte index
110+ /// Returns the byte index of the corresponding 1-indexed line and column
111+ /// numbers, translating from a UTF-16 character index to a UTF-8 byte index.
86112 pub fn byte_index ( & self , line : u32 , character : u32 ) -> u32 {
87- let line_start = match self . line_starts . get ( line as usize ) {
113+ let line_start = match self . line_starts . get ( line as usize - 1 ) {
88114 Some ( & line_start) => line_start,
89115 None => return self . length ,
90116 } ;
@@ -93,7 +119,7 @@ impl LineNumbers {
93119 let mut u16_offset = 0 ;
94120
95121 loop {
96- if u16_offset >= character {
122+ if u16_offset >= character - 1 {
97123 break ;
98124 }
99125
@@ -120,10 +146,10 @@ pub fn main() {
120146"# ;
121147 let line_numbers = LineNumbers :: new ( src) ;
122148
123- assert_eq ! ( line_numbers. byte_index( 0 , 0 ) , 0 ) ;
124- assert_eq ! ( line_numbers. byte_index( 0 , 4 ) , 4 ) ;
149+ assert_eq ! ( line_numbers. byte_index( 1 , 1 ) , 0 ) ;
150+ assert_eq ! ( line_numbers. byte_index( 1 , 5 ) , 4 ) ;
125151 assert_eq ! ( line_numbers. byte_index( 100 , 1 ) , src. len( ) as u32 ) ;
126- assert_eq ! ( line_numbers. byte_index( 2 , 1 ) , 18 ) ;
152+ assert_eq ! ( line_numbers. byte_index( 3 , 2 ) , 18 ) ;
127153}
128154
129155// https://github.com/gleam-lang/gleam/issues/3628
@@ -139,10 +165,10 @@ pub fn main() {
139165"# ;
140166 let line_numbers = LineNumbers :: new ( src) ;
141167
142- assert_eq ! ( line_numbers. byte_index( 1 , 6 ) , 30 ) ;
143- assert_eq ! ( line_numbers. byte_index( 5 , 2 ) , 52 ) ;
144- assert_eq ! ( line_numbers. byte_index( 5 , 17 ) , 75 ) ;
145- assert_eq ! ( line_numbers. byte_index( 6 , 1 ) , 91 ) ;
168+ assert_eq ! ( line_numbers. byte_index( 2 , 7 ) , 30 ) ;
169+ assert_eq ! ( line_numbers. byte_index( 6 , 3 ) , 52 ) ;
170+ assert_eq ! ( line_numbers. byte_index( 6 , 18 ) , 75 ) ;
171+ assert_eq ! ( line_numbers. byte_index( 7 , 2 ) , 91 ) ;
146172}
147173
148174// https://github.com/gleam-lang/gleam/issues/3628
0 commit comments