92
92
Ok ( ( ) )
93
93
}
94
94
95
+ /// Decode UCS-2 string to UTF-8 with a custom callback function.
96
+ ///
97
+ /// `output` is a function which receives every decoded character.
98
+ pub fn decode_with < F > ( input : & [ u16 ] , mut output : F ) -> Result < usize >
99
+ where
100
+ F : FnMut ( & [ u8 ] ) -> Result < ( ) > ,
101
+ {
102
+ let mut written = 0 ;
103
+
104
+ for ch in input. iter ( ) {
105
+ /*
106
+ * We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
107
+ * the Basic Multilingual Plane, a maximum of three bytes are needed.
108
+ */
109
+ if ( 0x000 ..0x0080 ) . contains ( ch) {
110
+ output ( & [ * ch as u8 ] ) ?;
111
+
112
+ written += 1 ;
113
+ } else if ( 0x0080 ..0x0800 ) . contains ( ch) {
114
+ let first = 0b1100_0000 + ch. get_bits ( 6 ..11 ) as u8 ;
115
+ let last = 0b1000_0000 + ch. get_bits ( 0 ..6 ) as u8 ;
116
+
117
+ output ( & [ first, last] ) ?;
118
+
119
+ written += 2 ;
120
+ } else {
121
+ let first = 0b1110_0000 + ch. get_bits ( 12 ..16 ) as u8 ;
122
+ let mid = 0b1000_0000 + ch. get_bits ( 6 ..12 ) as u8 ;
123
+ let last = 0b1000_0000 + ch. get_bits ( 0 ..6 ) as u8 ;
124
+
125
+ output ( & [ first, mid, last] ) ?;
126
+
127
+ written += 3 ;
128
+ }
129
+ }
130
+
131
+ Ok ( written)
132
+ }
133
+
95
134
/// Decode an input UCS-2 string into a UTF-8 string.
96
135
///
97
136
/// The returned `usize` represents the length of the returned buffer,
@@ -100,42 +139,43 @@ pub fn decode(input: &[u16], output: &mut [u8]) -> Result<usize> {
100
139
let buffer_size = output. len ( ) ;
101
140
let mut i = 0 ;
102
141
103
- for & ch in input. iter ( ) {
104
- /*
105
- * We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
106
- * the Basic Multilingual Plane, a maximum of three bytes are needed.
107
- */
108
- if ( 0x0000 ..0x0080 ) . contains ( & ch) {
142
+ decode_with ( input, |bytes| {
143
+ if bytes. len ( ) == 1 {
109
144
// Can be encoded in a single byte
110
145
if i >= buffer_size {
111
146
return Err ( Error :: BufferOverflow ) ;
112
147
}
113
148
114
- output[ i] = ch as u8 ;
149
+ output[ i] = bytes[ 0 ] ;
150
+
115
151
i += 1 ;
116
- } else if ( 0x0080 .. 0x0800 ) . contains ( & ch ) {
117
- // Can be encoded as two bytes
118
- if ( i + 1 ) >= buffer_size {
152
+ } else if bytes . len ( ) == 2 {
153
+ // Can be encoded two bytes
154
+ if i + 1 >= buffer_size {
119
155
return Err ( Error :: BufferOverflow ) ;
120
156
}
121
157
122
- output[ i] = 0b1100_0000 + ch. get_bits ( 6 ..11 ) as u8 ;
123
- output[ i + 1 ] = 0b1000_0000 + ch. get_bits ( 0 ..6 ) as u8 ;
158
+ output[ i] = bytes[ 0 ] ;
159
+ output[ i + 1 ] = bytes[ 1 ] ;
160
+
124
161
i += 2 ;
125
- } else {
126
- // Can be encoded as three bytes
127
- if ( i + 2 ) >= buffer_size {
162
+ } else if bytes . len ( ) == 3 {
163
+ // Can be encoded three bytes
164
+ if i + 2 >= buffer_size {
128
165
return Err ( Error :: BufferOverflow ) ;
129
166
}
130
167
131
- output[ i] = 0b1110_0000 + ch. get_bits ( 12 ..16 ) as u8 ;
132
- output[ i + 1 ] = 0b1000_0000 + ch. get_bits ( 6 ..12 ) as u8 ;
133
- output[ i + 2 ] = 0b1000_0000 + ch. get_bits ( 0 ..6 ) as u8 ;
168
+ output[ i] = bytes[ 0 ] ;
169
+ output[ i + 1 ] = bytes[ 1 ] ;
170
+ output[ i + 2 ] = bytes[ 2 ] ;
171
+
134
172
i += 3 ;
173
+ } else {
174
+ unreachable ! ( "More than three bytes per UCS-2 character." ) ;
135
175
}
136
- }
137
176
138
- Ok ( i)
177
+ Ok ( ( ) )
178
+ } )
139
179
}
140
180
141
181
#[ cfg( test) ]
@@ -165,4 +205,28 @@ mod tests {
165
205
assert_eq ! ( result. unwrap( ) , 9 ) ;
166
206
assert_eq ! ( core:: str :: from_utf8( & u8_buffer[ 0 ..9 ] ) , Ok ( "$¢ह한" ) ) ;
167
207
}
208
+
209
+ #[ test]
210
+ fn decoding_with ( ) {
211
+ let input = "$¢ह한" ;
212
+
213
+ let mut u16_buffer = [ 0u16 ; 4 ] ;
214
+ let result = encode ( input, & mut u16_buffer) ;
215
+ assert_eq ! ( result. unwrap( ) , 4 ) ;
216
+
217
+ let mut u8_buffer = [ 0u8 ; 9 ] ;
218
+ let mut pos = 0 ;
219
+
220
+ let result = decode_with ( & u16_buffer, |bytes| {
221
+ for byte in bytes. into_iter ( ) {
222
+ u8_buffer[ pos] = * byte;
223
+ pos += 1 ;
224
+ }
225
+
226
+ Ok ( ( ) )
227
+ } ) ;
228
+
229
+ assert_eq ! ( result. unwrap( ) , 9 ) ;
230
+ assert_eq ! ( core:: str :: from_utf8( & u8_buffer[ 0 ..9 ] ) , Ok ( "$¢ह한" ) ) ;
231
+ }
168
232
}
0 commit comments