Skip to content

Commit 77106cd

Browse files
author
Julius de Bruijn
authored
Add decode_with function. (#8)
1 parent f3fb79c commit 77106cd

File tree

1 file changed

+84
-20
lines changed

1 file changed

+84
-20
lines changed

src/lib.rs

Lines changed: 84 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,45 @@ where
9292
Ok(())
9393
}
9494

95+
/// Decode a UCS-2 string to UTF-8 with a custom callback function.
96+
///
97+
/// `output` is a function which receives the UTF-8 bytes of every decoded character.
98+
pub fn decode_with<F>(input: &[u16], mut output: F) -> Result<usize>
99+
where
100+
F: FnMut(&[u8]) -> Result<()>,
101+
{
102+
let mut written = 0;
103+
104+
for ch in input.iter() {
105+
/*
106+
* We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
107+
* the Basic Multilingual Plane, a maximum of three bytes are needed.
108+
*/
109+
if (0x000..0x0080).contains(ch) {
110+
output(&[*ch as u8])?;
111+
112+
written += 1;
113+
} else if (0x0080..0x0800).contains(ch) {
114+
let first = 0b1100_0000 + ch.get_bits(6..11) as u8;
115+
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;
116+
117+
output(&[first, last])?;
118+
119+
written += 2;
120+
} else {
121+
let first = 0b1110_0000 + ch.get_bits(12..16) as u8;
122+
let mid = 0b1000_0000 + ch.get_bits(6..12) as u8;
123+
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;
124+
125+
output(&[first, mid, last])?;
126+
127+
written += 3;
128+
}
129+
}
130+
131+
Ok(written)
132+
}
133+
95134
/// Decode an input UCS-2 string into a UTF-8 string.
96135
///
97136
/// The returned `usize` represents the length of the returned buffer,
@@ -100,42 +139,43 @@ pub fn decode(input: &[u16], output: &mut [u8]) -> Result<usize> {
100139
let buffer_size = output.len();
101140
let mut i = 0;
102141

103-
for &ch in input.iter() {
104-
/*
105-
* We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
106-
* the Basic Multilingual Plane, a maximum of three bytes are needed.
107-
*/
108-
if (0x0000..0x0080).contains(&ch) {
142+
decode_with(input, |bytes| {
143+
if bytes.len() == 1 {
109144
// Can be encoded in a single byte
110145
if i >= buffer_size {
111146
return Err(Error::BufferOverflow);
112147
}
113148

114-
output[i] = ch as u8;
149+
output[i] = bytes[0];
150+
115151
i += 1;
116-
} else if (0x0080..0x0800).contains(&ch) {
117-
// Can be encoded as two bytes
118-
if (i + 1) >= buffer_size {
152+
} else if bytes.len() == 2 {
153+
// Can be encoded as two bytes
154+
if i + 1 >= buffer_size {
119155
return Err(Error::BufferOverflow);
120156
}
121157

122-
output[i] = 0b1100_0000 + ch.get_bits(6..11) as u8;
123-
output[i + 1] = 0b1000_0000 + ch.get_bits(0..6) as u8;
158+
output[i] = bytes[0];
159+
output[i + 1] = bytes[1];
160+
124161
i += 2;
125-
} else {
126-
// Can be encoded as three bytes
127-
if (i + 2) >= buffer_size {
162+
} else if bytes.len() == 3 {
163+
// Can be encoded as three bytes
164+
if i + 2 >= buffer_size {
128165
return Err(Error::BufferOverflow);
129166
}
130167

131-
output[i] = 0b1110_0000 + ch.get_bits(12..16) as u8;
132-
output[i + 1] = 0b1000_0000 + ch.get_bits(6..12) as u8;
133-
output[i + 2] = 0b1000_0000 + ch.get_bits(0..6) as u8;
168+
output[i] = bytes[0];
169+
output[i + 1] = bytes[1];
170+
output[i + 2] = bytes[2];
171+
134172
i += 3;
173+
} else {
174+
unreachable!("More than three bytes per UCS-2 character.");
135175
}
136-
}
137176

138-
Ok(i)
177+
Ok(())
178+
})
139179
}
140180

141181
#[cfg(test)]
@@ -165,4 +205,28 @@ mod tests {
165205
assert_eq!(result.unwrap(), 9);
166206
assert_eq!(core::str::from_utf8(&u8_buffer[0..9]), Ok("$¢ह한"));
167207
}
208+
209+
#[test]
210+
fn decoding_with() {
211+
let input = "$¢ह한";
212+
213+
let mut u16_buffer = [0u16; 4];
214+
let result = encode(input, &mut u16_buffer);
215+
assert_eq!(result.unwrap(), 4);
216+
217+
let mut u8_buffer = [0u8; 9];
218+
let mut pos = 0;
219+
220+
let result = decode_with(&u16_buffer, |bytes| {
221+
for byte in bytes.into_iter() {
222+
u8_buffer[pos] = *byte;
223+
pos += 1;
224+
}
225+
226+
Ok(())
227+
});
228+
229+
assert_eq!(result.unwrap(), 9);
230+
assert_eq!(core::str::from_utf8(&u8_buffer[0..9]), Ok("$¢ह한"));
231+
}
168232
}

0 commit comments

Comments
 (0)