Skip to content

Commit cead90b

Browse files
committed
lib: move code to encoding module
In a next step, we can add more modules.
1 parent aa83752 commit cead90b

File tree

3 files changed

+225
-222
lines changed

3 files changed

+225
-222
lines changed

src/macros.rs renamed to src/encoding/macros.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::{ucs2_from_utf8_at_offset, Error};
1+
use super::{ucs2_from_utf8_at_offset, Error};
22

33
/// Count the number of UCS-2 characters in a string. Return an error if
44
/// the string cannot be encoded in UCS-2.

src/encoding/mod.rs

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
//! Low-level encoding and decoding facilities for UCS-2 strings.
2+
3+
mod macros;
4+
5+
/// These need to be public for the `ucs2_cstr!` macro, but are not
6+
/// intended to be called directly.
7+
#[doc(hidden)]
8+
pub use macros::{str_num_ucs2_chars, str_to_ucs2};
9+
10+
use bit_field::BitField;
11+
use core::fmt::{self, Display, Formatter};
12+
13+
/// Possible errors when encoding or decoding UCS-2 strings.
14+
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
15+
pub enum Error {
16+
/// Not enough space left in the output buffer.
///
/// Returned by both `encode` and `decode` when the caller-provided
/// buffer cannot hold the next character.
17+
BufferOverflow,
18+
/// Input contained a character which cannot be represented in UCS-2.
///
/// Returned when the UTF-8 input contains a four-byte sequence.
19+
MultiByte,
20+
}
21+
22+
impl Display for Error {
23+
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
24+
match self {
25+
Self::BufferOverflow => f.write_str("output buffer is too small"),
26+
Self::MultiByte => {
27+
f.write_str("input contains a character which cannot be represented in UCS-2")
28+
}
29+
}
30+
}
31+
}
32+
33+
type Result<T> = core::result::Result<T, Error>;
34+
35+
/// Value returned by `ucs2_from_utf8_at_offset`.
///
/// Pairs the decoded character with the length of its UTF-8 encoding so
/// that the caller can advance its byte offset to the next character.
36+
struct Ucs2CharFromUtf8 {
37+
/// UCS-2 character.
38+
val: u16,
39+
/// Number of bytes needed to encode the character in UTF-8.
///
/// `ucs2_from_utf8_at_offset` only ever sets this to 1, 2 or 3.
40+
num_bytes: u8,
41+
}
42+
43+
/// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
44+
///
45+
/// # Safety
46+
///
47+
/// The input `bytes` must be valid UTF-8.
48+
const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8> {
49+
let len = bytes.len();
50+
let ch;
51+
let ch_len;
52+
53+
if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
54+
ch = bytes[offset] as u16;
55+
ch_len = 1;
56+
} else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
57+
// 2 byte codepoint
58+
if offset + 1 >= len {
59+
// safe: len is the length of bytes,
60+
// and bytes is a direct view into the
61+
// buffer of input, which in order to be a valid
62+
// utf-8 string _must_ contain `i + 1`.
63+
unsafe { core::hint::unreachable_unchecked() }
64+
}
65+
66+
let a = (bytes[offset] & 0b0001_1111) as u16;
67+
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
68+
ch = a << 6 | b;
69+
ch_len = 2;
70+
} else if bytes[offset] & 0b1111_0000 == 0b1110_0000 {
71+
// 3 byte codepoint
72+
if offset + 2 >= len || offset + 1 >= len {
73+
// safe: impossible utf-8 string.
74+
unsafe { core::hint::unreachable_unchecked() }
75+
}
76+
77+
let a = (bytes[offset] & 0b0000_1111) as u16;
78+
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
79+
let c = (bytes[offset + 2] & 0b0011_1111) as u16;
80+
ch = a << 12 | b << 6 | c;
81+
ch_len = 3;
82+
} else if bytes[offset] & 0b1111_0000 == 0b1111_0000 {
83+
return Err(Error::MultiByte); // UTF-16
84+
} else {
85+
// safe: impossible utf-8 string.
86+
unsafe { core::hint::unreachable_unchecked() }
87+
}
88+
89+
Ok(Ucs2CharFromUtf8 {
90+
val: ch,
91+
num_bytes: ch_len,
92+
})
93+
}
94+
95+
/// Encodes an input UTF-8 string into a UCS-2 string.
96+
///
97+
/// The returned `usize` represents the length of the returned buffer,
98+
/// measured in 2-byte characters.
99+
pub fn encode(input: &str, buffer: &mut [u16]) -> Result<usize> {
100+
let buffer_size = buffer.len();
101+
let mut i = 0;
102+
103+
encode_with(input, |ch| {
104+
if i >= buffer_size {
105+
Err(Error::BufferOverflow)
106+
} else {
107+
buffer[i] = ch;
108+
i += 1;
109+
Ok(())
110+
}
111+
})?;
112+
113+
Ok(i)
114+
}
115+
116+
/// Encode UTF-8 string to UCS-2 with a custom callback function.
117+
///
118+
/// `output` is a function which receives every encoded character.
119+
pub fn encode_with<F>(input: &str, mut output: F) -> Result<()>
120+
where
121+
F: FnMut(u16) -> Result<()>,
122+
{
123+
let bytes = input.as_bytes();
124+
let len = bytes.len();
125+
let mut i = 0;
126+
127+
while i < len {
128+
// SAFETY: `bytes` is valid UTF-8.
129+
let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?;
130+
i += usize::from(ch.num_bytes);
131+
output(ch.val)?;
132+
}
133+
Ok(())
134+
}
135+
136+
/// Decode UCS-2 string to UTF-8 with a custom callback function.
137+
///
138+
/// `output` is a function which receives every decoded character.
139+
/// Due to the nature of UCS-2, the function can receive an UTF-8 character
140+
/// of up to three bytes, for every input character.
141+
pub fn decode_with<F>(input: &[u16], mut output: F) -> Result<usize>
142+
where
143+
F: FnMut(&[u8]) -> Result<()>,
144+
{
145+
let mut written = 0;
146+
147+
for ch in input.iter() {
148+
/*
149+
* We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
150+
* the Basic Multilingual Plane, a maximum of three bytes are needed.
151+
*/
152+
if (0x000..0x0080).contains(ch) {
153+
output(&[*ch as u8])?;
154+
155+
written += 1;
156+
} else if (0x0080..0x0800).contains(ch) {
157+
let first = 0b1100_0000 + ch.get_bits(6..11) as u8;
158+
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;
159+
160+
output(&[first, last])?;
161+
162+
written += 2;
163+
} else {
164+
let first = 0b1110_0000 + ch.get_bits(12..16) as u8;
165+
let mid = 0b1000_0000 + ch.get_bits(6..12) as u8;
166+
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;
167+
168+
output(&[first, mid, last])?;
169+
170+
written += 3;
171+
}
172+
}
173+
174+
Ok(written)
175+
}
176+
177+
/// Decode an input UCS-2 string into a UTF-8 string.
178+
///
179+
/// The returned `usize` represents the length of the returned buffer,
180+
/// in bytes. Due to the nature of UCS-2, the output buffer could end up with
181+
/// three bytes for every character in the input buffer.
182+
pub fn decode(input: &[u16], output: &mut [u8]) -> Result<usize> {
183+
let buffer_size = output.len();
184+
let mut i = 0;
185+
186+
decode_with(input, |bytes| {
187+
if bytes.len() == 1 {
188+
// Can be encoded in a single byte
189+
if i >= buffer_size {
190+
return Err(Error::BufferOverflow);
191+
}
192+
193+
output[i] = bytes[0];
194+
195+
i += 1;
196+
} else if bytes.len() == 2 {
197+
// Can be encoded two bytes
198+
if i + 1 >= buffer_size {
199+
return Err(Error::BufferOverflow);
200+
}
201+
202+
output[i] = bytes[0];
203+
output[i + 1] = bytes[1];
204+
205+
i += 2;
206+
} else if bytes.len() == 3 {
207+
// Can be encoded three bytes
208+
if i + 2 >= buffer_size {
209+
return Err(Error::BufferOverflow);
210+
}
211+
212+
output[i] = bytes[0];
213+
output[i + 1] = bytes[1];
214+
output[i + 2] = bytes[2];
215+
216+
i += 3;
217+
} else {
218+
unreachable!("More than three bytes per UCS-2 character.");
219+
}
220+
221+
Ok(())
222+
})
223+
}

0 commit comments

Comments
 (0)