1
1
// Copyright (C) 2020 Quentin M. Kniep <[email protected] >
2
2
// Distributed under terms of the MIT license.
3
3
4
- const legacyImpl = {
5
- at ( s: string, end: number, i: number) : number {
6
- return i < end ? s. charCodeAt ( i) : -1
7
- } ,
8
- width ( c: number) : number {
9
- return 1
10
- } ,
11
- }
12
-
13
- const unicodeImpl = {
14
- at ( s: string, end: number, i: number) : number {
15
- return i < end ? s. codePointAt ( i) ! : -1
16
- } ,
17
- width ( c: number) : number {
18
- return c > 0xffff ? 2 : 1
19
- } ,
20
- }
4
use std::collections::VecDeque;
21
5
22
/// Cursor over a source string that keeps a small window of upcoming
/// characters buffered, so callers can peek at them and conditionally
/// consume them.
///
/// `index` and `end` are used as byte offsets into `src` (see `Reader::at`).
#[derive(Debug)]
pub struct Reader {
    /// Value of the `u_flag` passed to the most recent `reset`.
    unicode: bool,
    /// Owned copy of the text being read.
    src: String,
    /// Offset of the first buffered character (`cps[0]`), if any.
    index: usize,
    /// Exclusive upper bound for reads; nothing at or past it is returned.
    end: usize,
    /// Look-ahead window: the character at `index` and the ones after it.
    cps: VecDeque<char>,
    /// Width of each entry in `cps`; always the same length as `cps`.
    widths: VecDeque<usize>,
}

impl Default for Reader {
    fn default() -> Self {
        Self::new()
    }
}

impl Reader {
    /// Maximum number of characters kept in the look-ahead window.
    const LOOKAHEAD: usize = 4;

    /// Creates a reader over the empty string. Call `reset` to attach input.
    pub fn new() -> Self {
        Self {
            unicode: false,
            src: String::new(),
            index: 0,
            end: 0,
            cps: VecDeque::with_capacity(Self::LOOKAHEAD),
            widths: VecDeque::with_capacity(Self::LOOKAHEAD),
        }
    }

    /// Returns the text passed to the last `reset`.
    pub fn source(&self) -> &str {
        &self.src
    }

    /// Returns the offset of the current (not yet consumed) character.
    pub fn index(&self) -> usize {
        self.index
    }

    /// Peeks at the buffered character `offset` positions ahead of the
    /// current one (`0` is the current character).
    ///
    /// Returns `None` when `offset` is outside the look-ahead window or the
    /// input ends before that position.
    pub fn code_point_with_offset(&self, offset: usize) -> Option<&char> {
        self.cps.get(offset)
    }

    /// Replaces the input with `source`, restricts reading to offsets below
    /// `end`, records `u_flag`, and positions the reader at `start`.
    ///
    /// `end` may be larger than `source.len()`; reads past the actual text
    /// simply yield nothing.
    pub fn reset(&mut self, source: &str, start: usize, end: usize, u_flag: bool) {
        self.unicode = u_flag;
        // Reuse the existing allocation instead of building a new `String`.
        self.src.clear();
        self.src.push_str(source);
        self.end = end;
        self.rewind(start);
    }

    /// Moves the reader to `index` and refills the look-ahead window from
    /// there.
    pub fn rewind(&mut self, index: usize) {
        self.index = index;
        self.cps.clear();
        self.widths.clear();

        // Running offset of the next character to buffer; replaces
        // re-summing the widths on every iteration.
        let mut offset = index;
        while self.cps.len() < Self::LOOKAHEAD {
            match self.at(offset) {
                Some(c) => {
                    let w = self.width(c);
                    self.cps.push_back(c);
                    self.widths.push_back(w);
                    offset += w;
                }
                None => break,
            }
        }
    }

    /// Consumes the current character, if there is one, and tops the
    /// look-ahead window up with the next unread character.
    pub fn advance(&mut self) {
        let consumed = match self.widths.pop_front() {
            Some(w) => w,
            // Window is empty: already at the end of the input.
            None => return,
        };
        self.cps.pop_front();
        self.index += consumed;

        // The first unbuffered character sits right after everything that
        // is still in the window.
        let next = self.index + self.widths.iter().sum::<usize>();
        if let Some(c) = self.at(next) {
            // The width recorded must belong to the character being pushed,
            // so `cps` and `widths` stay aligned entry by entry.
            let w = self.width(c);
            self.cps.push_back(c);
            self.widths.push_back(w);
        }
    }

    /// Consumes the current character if it equals `cp`.
    ///
    /// Returns `true` when the character was consumed.
    pub fn eat(&mut self, cp: char) -> bool {
        if self.cps.get(0) != Some(&cp) {
            return false;
        }
        self.advance();
        true
    }

    /// Consumes the next two characters if they equal `cp1`, `cp2` in order.
    ///
    /// Returns `true` when both were consumed; otherwise nothing is consumed.
    pub fn eat2(&mut self, cp1: char, cp2: char) -> bool {
        if self.cps.get(0) != Some(&cp1) || self.cps.get(1) != Some(&cp2) {
            return false;
        }
        self.advance();
        self.advance();
        true
    }

    /// Consumes the next three characters if they equal `cp1`, `cp2`, `cp3`
    /// in order.
    ///
    /// Returns `true` when all three were consumed; otherwise nothing is
    /// consumed.
    pub fn eat3(&mut self, cp1: char, cp2: char, cp3: char) -> bool {
        if self.cps.get(0) != Some(&cp1)
            || self.cps.get(1) != Some(&cp2)
            || self.cps.get(2) != Some(&cp3)
        {
            return false;
        }
        self.advance();
        self.advance();
        self.advance();
        true
    }

    /// Reads the character at byte offset `i`, or `None` when `i` is at or
    /// past `end` or past the end of the text.
    ///
    /// Each byte is currently converted to a `char` on its own, so only
    /// ASCII input is read faithfully.
    fn at(&self, i: usize) -> Option<char> {
        if i >= self.end {
            return None;
        }
        // TODO: read non ASCII as UTF-8 when `unicode` is set, and as UTF-16
        // otherwise. Both modes currently share the byte-wise path.
        self.src.as_bytes().get(i).map(|&b| char::from(b))
    }

    /// Returns how many positions `c` occupies: 2 for characters above
    /// U+FFFF in unicode mode, otherwise 1.
    fn width(&self, c: char) -> usize {
        if self.unicode && c > '\u{FFFF}' {
            2
        } else {
            1
        }
    }
}
153
136
154
137
#[ cfg( test) ]
155
138
mod tests {
156
139
use super :: * ;
157
140
158
141
#[ test]
159
142
fn eat_test ( ) {
160
- let reader = Reader :: new ( ) ;
143
+ let mut reader = Reader :: new ( ) ;
161
144
reader. reset ( "abcdefghijk" , 0 , 11 , true ) ;
162
145
assert_eq ! ( reader. eat( 'a' ) , true ) ;
163
146
assert_eq ! ( reader. eat3( 'b' , 'd' , 'd' ) , false ) ;
@@ -168,4 +151,26 @@ mod tests {
168
151
assert_eq ! ( reader. eat2( 'h' , 'i' ) , true ) ;
169
152
assert_eq ! ( reader. eat3( 'j' , 'k' , 'a' ) , false ) ;
170
153
}
154
+
155
/// Checks that `rewind` restores the reader to an earlier position so the
/// same sequence of `eat` calls succeeds a second time.
#[test]
fn rewind_test() {
    let mut reader = Reader::new();
    reader.reset("abcd", 0, 4, true);
    assert!(reader.eat('a'));
    // A failed match must not consume anything.
    assert!(!reader.eat3('b', 'd', 'd'));
    assert!(reader.eat3('b', 'c', 'd'));

    // After rewinding to the start, the identical sequence must work again.
    reader.rewind(0);
    assert!(reader.eat('a'));
    assert!(!reader.eat3('b', 'd', 'd'));
    assert!(reader.eat3('b', 'c', 'd'));
}
167
+
168
+ /*#[test]
169
+ fn at_test_es_compliance() {
170
+ let mut reader = Reader::new();
171
+ reader.reset("a🩢☃★♲", 0, 20, false);
172
+ assert_eq!(reader.at(0).unwrap() as u32, 56256);
173
+ reader.reset("a🩢☃★♲", 0, 20, true);
174
+ assert_eq!(reader.at(0).unwrap() as u32, 1048771);
175
+ }*/
171
176
}
0 commit comments