diff --git a/docs/CHANGES.TXT b/docs/CHANGES.TXT index 5042253ba..8b3576635 100644 --- a/docs/CHANGES.TXT +++ b/docs/CHANGES.TXT @@ -1,5 +1,6 @@ 1.0 (to be released) ----------------- +- New: Add Encoder Module to Rust - Fix: Elementary stream regressions - Fix: Segmentation faults on XDS files - Fix: Clippy Errors Based on Rust 1.88 diff --git a/src/lib_ccx/ccx_encoders_common.c b/src/lib_ccx/ccx_encoders_common.c index c2540dd10..2c77e3f90 100644 --- a/src/lib_ccx/ccx_encoders_common.c +++ b/src/lib_ccx/ccx_encoders_common.c @@ -19,6 +19,10 @@ int fsync(int fd) } #endif +#ifndef DISABLE_RUST +int ccxr_get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_subs, + enum ccx_encoding_type in_enc, enum ccx_encoding_type out_enc, int max_len); +#endif // These are the default settings for plain transcripts. No times, no CC or caption mode, and no XDS. ccx_encoders_transcript_format ccx_encoders_default_transcript_settings = { @@ -293,6 +297,9 @@ int change_ascii_encoding(unsigned char *dest, unsigned char *src, int len, enum int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_subs, enum ccx_encoding_type in_enc, enum ccx_encoding_type out_enc, int max_len) { +#ifndef DISABLE_RUST + return ccxr_get_str_basic(out_buffer, in_buffer, trim_subs, in_enc, out_enc, max_len); +#else int last_non_blank = -1; int first_non_blank = -1; int len = 0; @@ -305,7 +312,6 @@ int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_ *out_buffer = 0; return 0; } - // change encoding only when required switch (in_enc) { @@ -331,6 +337,7 @@ int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_ return (unsigned)len; // Return length return 0; // Return length +#endif } int write_subtitle_file_footer(struct encoder_ctx *ctx, struct ccx_s_write *out) @@ -631,8 +638,8 @@ int write_cc_buffer_as_simplexml(struct eia608_screen *data, struct encoder_ctx if (data->row_used[i]) { write_cc_line_as_simplexml(data, context, i); + wrote_something = 1; } - wrote_something = 1; } return wrote_something; } diff --git a/src/lib_ccx/ccx_encoders_spupng.c b/src/lib_ccx/ccx_encoders_spupng.c index 815c521ed..17da7803b 100644 --- a/src/lib_ccx/ccx_encoders_spupng.c +++ b/src/lib_ccx/ccx_encoders_spupng.c @@ -29,18 +29,6 @@ FT_Face face_regular = NULL; FT_Face face_italics = NULL; FT_Face face = NULL; -struct spupng_t -{ - FILE *fpxml; - FILE *fppng; - char *dirname; - char *pngfile; - char *relative_path_png; - int fileIndex; - int xOffset; - int yOffset; -}; - #define CCPL (ccfont2_width / CCW * ccfont2_height / CCH) static int initialized = 0; diff --git a/src/lib_ccx/ccx_encoders_structs.h b/src/lib_ccx/ccx_encoders_structs.h index 6c4d3cdb6..83adc509c 100644 --- a/src/lib_ccx/ccx_encoders_structs.h +++ b/src/lib_ccx/ccx_encoders_structs.h @@ -29,4 +29,16 @@ struct ccx_s_write }; +struct spupng_t +{ + FILE *fpxml; + FILE *fppng; + char *dirname; + char *pngfile; + char *relative_path_png; + int fileIndex; + int xOffset; + int yOffset; +}; + #endif diff --git a/src/rust/build.rs b/src/rust/build.rs index 482694e86..a2005dd65 100644 --- a/src/rust/build.rs +++ b/src/rust/build.rs @@ -12,6 +12,9 @@ fn main() { "writercwtdata", "version", "set_binary_mode", + "net_send_header", // shall be removed after NET + "write_spumux_footer", + "write_spumux_header", ]); #[cfg(feature = "hardsubx_ocr")] @@ -39,6 +42,7 @@ fn main() { "ccx_encoding_type", "ccx_decoder_608_settings", "ccx_decoder_608_report", + "eia608_screen", "uint8_t", "word_list", ]); diff --git a/src/rust/lib_ccxr/src/encoder/mod.rs b/src/rust/lib_ccxr/src/encoder/mod.rs new file mode 100644 index 000000000..8425cfb5a --- /dev/null +++ b/src/rust/lib_ccxr/src/encoder/mod.rs @@ -0,0 +1,4 @@ +pub mod txt_helpers; +/* Note +This is a part of the encoder library, which is made in pure rust; hence it's kept in `lib_ccxr` instead of `src`. + */ diff --git a/src/rust/lib_ccxr/src/encoder/txt_helpers.rs b/src/rust/lib_ccxr/src/encoder/txt_helpers.rs new file mode 100644 index 000000000..a1986b9ee --- /dev/null +++ b/src/rust/lib_ccxr/src/encoder/txt_helpers.rs @@ -0,0 +1,427 @@ +use crate::info; +use crate::util::encoding::*; +use std::cmp; +use std::convert::TryFrom; + +pub enum EncoderError { + Retry = -100, // CCX_EAGAIN + EOF = -101, // CCX_EOF + InvalidArgument = -102, // CCX_EINVAL + Unsupported = -103, // CCX_ENOSUPP + OutOfMemory = -104, // CCX_ENOMEM +} + +fn find_limit_characters( + line: &[u8], + first_non_blank: &mut i32, + last_non_blank: &mut i32, + max_len: usize, +) { + *first_non_blank = -1; + *last_non_blank = -1; + + let limit = cmp::min(line.len(), max_len); + + for (i, &c) in line.iter().take(limit).enumerate() { + if c == b'\0' || c == b'\n' || c == b'\r' { + break; + } + if c != b' ' && c != 0x89 { + if *first_non_blank < 0 { + *first_non_blank = i as i32; + } + *last_non_blank = i as i32; + } + } +} + +fn change_utf8_encoding(dest: &mut [u8], src: &[u8], len: i32, out_enc: Encoding) -> i32 { + let mut dest_idx = 0; + let mut src_idx = 0; + let max = usize::min(src.len(), len as usize); + + while src_idx < max { + let c = src[src_idx]; + let c_len: usize; + + if c < 0x80 { + c_len = 1; + } else if (c & 0x20) == 0 { + c_len = 2; + } else if (c & 0x10) == 0 { + c_len = 3; + } else if (c & 0x08) == 0 { + c_len = 4; + } else if (c & 0x04) == 0 { + c_len = 5; + } else { + c_len = 1; + } + + match out_enc { + Encoding::UTF8 => { + if max <= dest.len() { + dest[..max].copy_from_slice(&src[..max]); + return max as i32; + } else { + return EncoderError::Unsupported as i32; + } + } + Encoding::Latin1 => { + let cp = if c_len == 1 { + src[src_idx] as u32 + } else if c_len == 2 && src_idx + 1 < max && (src[src_idx + 1] & 0x40) == 0 { + (((src[src_idx] & 0x1F) as u32) << 6) | ((src[src_idx + 1] & 0x3F) as u32) + } else if c_len == 3 + && src_idx + 2 < max + && (src[src_idx + 1] & 0x40) == 0 + && (src[src_idx + 2] & 0x40) == 0 + { + (((src[src_idx] & 0x0F) as u32) << 12) + | (((src[src_idx + 1] & 0x3F) as u32) << 6) + | ((src[src_idx + 2] & 0x3F) as u32) + } else if c_len == 4 + && src_idx + 3 < max + && (src[src_idx + 1] & 0x40) == 0 + && (src[src_idx + 2] & 0x40) == 0 + && (src[src_idx + 3] & 0x40) == 0 + { + (((src[src_idx] & 0x07) as u32) << 18) + | (((src[src_idx + 1] & 0x3F) as u32) << 12) + | (((src[src_idx + 2] & 0x3F) as u32) << 6) + | ((src[src_idx + 3] & 0x3F) as u32) + } else if c_len == 5 + && src_idx + 4 < max + && (src[src_idx + 1] & 0x40) == 0 + && (src[src_idx + 2] & 0x40) == 0 + && (src[src_idx + 3] & 0x40) == 0 + && (src[src_idx + 4] & 0x40) == 0 + { + (((src[src_idx] & 0x03) as u32) << 24u32) + | (((src[src_idx + 1] & 0x3F) as u32) << 18u32) + | (((src[src_idx + 2] & 0x3F) as u32) << 12u32) + | (((src[src_idx + 3] & 0x3F) as u32) << 6u32) + | ((src[src_idx + 4] & 0x3F) as u32) + } else { + 0x3F + }; + + if c_len == 1 || cp == 0x3F { + dest[dest_idx] = if c_len == 1 { src[src_idx] } else { b'?' }; + } else { + let mapped_cp = utf8_to_latin1_map(cp) as u16; + dest[dest_idx] = if mapped_cp <= 255 { + mapped_cp as u8 + } else { + b'?' + }; + } + dest_idx += 1; + } + Encoding::UCS2 => { + return EncoderError::Unsupported as i32; + } + Encoding::Line21 => { + dest[dest_idx] = if c_len == 1 { src[src_idx] } else { b'?' }; + dest_idx += 1; + } + } + src_idx += c_len; + } + + if dest_idx < dest.len() { + dest[dest_idx] = 0; + } + dest_idx as i32 +} + +#[allow(unused_variables)] +fn change_latin1_encoding(dest: &mut [u8], src: &[u8], len: i32, out_enc: Encoding) -> i32 { + EncoderError::Unsupported as i32 +} + +#[allow(unused_variables)] +fn change_unicode_encoding(dest: &mut [u8], src: &[u8], len: i32, out_enc: Encoding) -> i32 { + EncoderError::Unsupported as i32 +} + +pub fn change_ascii_encoding( + dest: &mut Vec, + src: &[u8], + out_enc: Encoding, +) -> Result { + dest.clear(); + + for &c in src { + match out_enc { + Encoding::UTF8 => { + let utf8_char = line21_to_utf8(c); + let first_non_zero = (utf8_char.leading_zeros() / 8) as usize; + let bytes = utf8_char.to_be_bytes(); + let byte_count = bytes.len() - first_non_zero; + if byte_count == 0 || byte_count > 4 { + return Err(-1); + } + dest.extend_from_slice(&bytes[first_non_zero..(first_non_zero + byte_count)]); + } + Encoding::Latin1 => { + let latin1_char = line21_to_latin1(c); + dest.push(latin1_char); + } + Encoding::UCS2 => { + let ucs2_char = line21_to_ucs2(c); + dest.extend_from_slice(&ucs2_char.to_le_bytes()); + } + Encoding::Line21 => { + dest.extend_from_slice(src); + return Ok(src.len()); + } + } + } + + dest.push(0); + + Ok(dest.len() - 1) +} + +fn utf8_to_latin1_map(character: u32) -> Latin1Char { + ucs2_to_latin1(char_to_ucs2(char::try_from(character).unwrap())) +} + +pub fn get_str_basic( + out_buffer: &mut Vec, + in_buffer: &[u8], + trim_subs: bool, + in_enc: Encoding, + out_enc: Encoding, + max_len: i32, +) -> i32 { + let mut last_non_blank: i32 = -1; + let mut first_non_blank: i32 = -1; + let len; + + find_limit_characters( + in_buffer, + &mut first_non_blank, + &mut last_non_blank, + max_len as usize, + ); + + if first_non_blank == -1 { + out_buffer.clear(); + out_buffer.push(0); + return 0; + } + + let mut content_length = last_non_blank - first_non_blank + 1; + + if !trim_subs { + first_non_blank = 0; + content_length = last_non_blank + 1; + } + + if (first_non_blank + content_length) as usize > in_buffer.len() { + out_buffer.clear(); + out_buffer.push(0); + return 0; + } + + out_buffer.clear(); + + match in_enc { + Encoding::UTF8 => { + let mut temp_buffer = vec![0u8; (content_length * 4) as usize]; + len = change_utf8_encoding( + &mut temp_buffer, + &in_buffer[first_non_blank as usize..], + content_length, + out_enc, + ); + if len > 0 { + out_buffer.extend_from_slice(&temp_buffer[..len as usize]); + } + } + Encoding::Latin1 => { + let mut temp_buffer = vec![0u8; (content_length * 4) as usize]; + len = change_latin1_encoding( + &mut temp_buffer, + &in_buffer[first_non_blank as usize..], + content_length, + out_enc, + ); + if len > 0 { + out_buffer.extend_from_slice(&temp_buffer[..len as usize]); + } + } + Encoding::UCS2 => { + let mut temp_buffer = vec![0u8; (content_length * 4) as usize]; + len = change_unicode_encoding( + &mut temp_buffer, + &in_buffer[first_non_blank as usize..], + content_length, + out_enc, + ); + if len > 0 { + out_buffer.extend_from_slice(&temp_buffer[..len as usize]); + } + } + Encoding::Line21 => { + let input_slice = + &in_buffer[first_non_blank as usize..(first_non_blank + content_length) as usize]; + len = change_ascii_encoding(out_buffer, input_slice, out_enc) + .unwrap_or(EncoderError::Retry as usize) as i32; + } + } + + if len < 0 { + info!("WARNING: Could not encode in specified format\n"); + out_buffer.clear(); + out_buffer.push(0); + 0 + } else if len == EncoderError::Unsupported as i32 { + info!("WARNING: Encoding is not yet supported\n"); + out_buffer.clear(); + out_buffer.push(0); + return 0; + } else { + out_buffer.push(0); + return len; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_line_with_content() { + let line = b" hello world \n"; + let mut first_non_blank = 0; + let mut last_non_blank = 0; + + find_limit_characters(line, &mut first_non_blank, &mut last_non_blank, 15); + + assert_eq!(first_non_blank, 2); + assert_eq!(last_non_blank, 12); + } + + #[test] + fn test_line_with_special_chars() { + let line = b" \x89 abc \x89 def \r"; + let mut first_non_blank = 0; + let mut last_non_blank = 0; + + find_limit_characters(line, &mut first_non_blank, &mut last_non_blank, 20); + + assert_eq!(first_non_blank, 3); + } + #[test] + fn test_utf8_to_utf8() { + let src = b"Hello, \xC3\xA9world!"; + let mut dest = [0u8; 20]; + + let result = change_utf8_encoding(&mut dest, src, src.len() as i32, Encoding::UTF8); + + assert_eq!(result, src.len() as i32); + assert_eq!(&dest[..src.len()], src); + } + + #[test] + fn test_utf8_to_ascii() { + let src = b"Hello, \xC3\xA9world!"; + let mut dest = [0u8; 20]; + + let result = change_utf8_encoding(&mut dest, src, src.len() as i32, Encoding::Line21); + + assert_eq!(result, 14); + assert_eq!(&dest[..14], b"Hello, ?world!"); + } + + #[test] + fn test_unsupported_encoding() { + let src = b"Hello"; + let mut dest = [0u8; 10]; + + let result = change_utf8_encoding(&mut dest, src, src.len() as i32, Encoding::UCS2); + + assert_eq!(result, EncoderError::Unsupported as i32); + } + #[test] + fn test_ascii_to_ascii() { + let src = b"Hello World!"; + let mut dest = Vec::with_capacity(20); + let result = change_ascii_encoding(&mut dest, src, Encoding::Line21); + + assert_eq!(result.unwrap(), src.len()); + assert_eq!(&dest[..src.len()], src); + } + + #[test] + fn test_ascii_to_utf8() { + let src = b"Hello"; + let mut dest = Vec::with_capacity(20); + + let result = change_ascii_encoding(&mut dest, src, Encoding::UTF8); + + assert_eq!(result.unwrap(), 5); + assert_eq!(&dest[..5], b"Hello"); + assert_eq!(dest[5], 0); + } + + #[test] + fn test_ascii_to_latin1() { + let src = b"Test"; + let mut dest = Vec::with_capacity(20); + + let result = change_ascii_encoding(&mut dest, src, Encoding::Latin1); + + assert_eq!(result.unwrap(), 4); + assert_eq!(&dest[..4], b"Test"); + assert_eq!(dest[4], 0); + } + + #[test] + fn test_ascii_to_unicode() { + let src = b"Hi"; + let mut dest = Vec::with_capacity(20); + + let result = change_ascii_encoding(&mut dest, src, Encoding::UCS2); + + assert_eq!(result.unwrap(), 4); + assert_eq!(dest[4], 0); + } + + #[test] + fn test_get_str_basic_with_trim() { + let in_buffer = b" Hello \0"; + let mut out_buffer = Vec::with_capacity(20); + + let result = get_str_basic( + &mut out_buffer, + in_buffer, + true, + Encoding::Line21, + Encoding::Line21, + 10, + ); + + assert!(result > 0); + } + + #[test] + fn test_get_str_basic_without_trim() { + let in_buffer = b" Hello \0"; + let mut out_buffer = Vec::with_capacity(20); + + let result = get_str_basic( + &mut out_buffer, + in_buffer, + false, + Encoding::Line21, + Encoding::Line21, + 10, + ); + + assert!(result > 0); + } +} diff --git a/src/rust/lib_ccxr/src/lib.rs b/src/rust/lib_ccxr/src/lib.rs index 9f32678db..4b02a4dfa 100644 --- a/src/rust/lib_ccxr/src/lib.rs +++ b/src/rust/lib_ccxr/src/lib.rs @@ -1,5 +1,6 @@ pub mod activity; pub mod common; +pub mod encoder; pub mod hardsubx; pub mod subtitle; pub mod teletext; diff --git a/src/rust/lib_ccxr/src/util/encoding.rs b/src/rust/lib_ccxr/src/util/encoding.rs index 6f0a77a35..e81db037e 100644 --- a/src/rust/lib_ccxr/src/util/encoding.rs +++ b/src/rust/lib_ccxr/src/util/encoding.rs @@ -4,8 +4,8 @@ //! represented by [`Encoding`]. //! - [`Line 21`](Encoding::Line21) - Used in 608 captions. //! - [`Latin-1`](Encoding::Latin1) - ISO/IEC 8859-1. -//! - [`UCS-2`](Encoding::Ucs2) - UCS-2 code points. -//! - [`UTF-8`](Encoding::Utf8) +//! - [`UCS-2`](Encoding::UCS2) - UCS-2 code points. +//! - [`UTF-8`](Encoding::UTF8) //! //! To represent a string in any one of the above encoding, use the following respectively. //! - [`Line21String`] @@ -44,8 +44,8 @@ pub enum Encoding { Line21, // Same as `CCX_ENC_ASCII` in C Latin1, // Same as `CCX_ENC_LATIN_1` in C #[default] - Utf8, // Same as `CCX_ENC_UTF_8` in C - Ucs2, // Same as `CCX_ENC_UNICODE` in C + UTF8, // Same as `CCX_ENC_UTF_8` in C + UCS2, // Same as `CCX_ENC_UNICODE` in C } /// Represents a character in Line 21 encoding. @@ -56,6 +56,8 @@ pub type Latin1Char = u8; /// Represents a character in UCS-2 encoding. pub type Ucs2Char = u16; +/// Represents a character in UTF-8 encoding. +pub type Utf8Char = u32; /// A String-like type containing a sequence of Line 21 encoded characters. #[derive(Clone, Debug, Eq, PartialEq, Default)] @@ -115,8 +117,8 @@ impl Line21String { match encoding { Encoding::Line21 => self.clone().into(), Encoding::Latin1 => EncodedString::Latin1(self.into()), - Encoding::Ucs2 => EncodedString::Ucs2(self.into()), - Encoding::Utf8 => EncodedString::Utf8(self.into()), + Encoding::UCS2 => EncodedString::Ucs2(self.into()), + Encoding::UTF8 => EncodedString::Utf8(self.into()), } } @@ -172,8 +174,8 @@ impl Latin1String { match encoding { Encoding::Line21 => EncodedString::Line21(self.into()), Encoding::Latin1 => self.clone().into(), - Encoding::Ucs2 => EncodedString::Ucs2(self.into()), - Encoding::Utf8 => EncodedString::Utf8(self.into()), + Encoding::UCS2 => EncodedString::Ucs2(self.into()), + Encoding::UTF8 => EncodedString::Utf8(self.into()), } } @@ -229,8 +231,8 @@ impl Ucs2String { match encoding { Encoding::Line21 => EncodedString::Line21(self.into()), Encoding::Latin1 => EncodedString::Latin1(self.into()), - Encoding::Ucs2 => self.clone().into(), - Encoding::Utf8 => EncodedString::Utf8(self.into()), + Encoding::UCS2 => self.clone().into(), + Encoding::UTF8 => EncodedString::Utf8(self.into()), } } @@ -339,7 +341,7 @@ impl From<&str> for Ucs2String { impl From<&Line21String> for String { fn from(value: &Line21String) -> String { - value.as_vec().iter().map(|&c| line21_to_utf8(c)).collect() + value.as_vec().iter().map(|&c| line21_to_char(c)).collect() } } @@ -372,8 +374,8 @@ impl EncodedString { match encoding { Encoding::Line21 => EncodedString::Line21(string.into()), Encoding::Latin1 => EncodedString::Latin1(string.into()), - Encoding::Ucs2 => EncodedString::Ucs2(string.into()), - Encoding::Utf8 => EncodedString::Utf8(string.to_string()), + Encoding::UCS2 => EncodedString::Ucs2(string.into()), + Encoding::UTF8 => EncodedString::Utf8(string.to_string()), } } @@ -389,8 +391,8 @@ impl EncodedString { match self { EncodedString::Line21(_) => Encoding::Line21, EncodedString::Latin1(_) => Encoding::Latin1, - EncodedString::Ucs2(_) => Encoding::Ucs2, - EncodedString::Utf8(_) => Encoding::Utf8, + EncodedString::Ucs2(_) => Encoding::UCS2, + EncodedString::Utf8(_) => Encoding::UTF8, } } @@ -399,7 +401,7 @@ impl EncodedString { /// # Examples /// ```rust /// # use lib_ccxr::util::encoding::*; - /// let s = EncodedString::from_str("Hi 😀", Encoding::Ucs2); + /// let s = EncodedString::from_str("Hi 😀", Encoding::UCS2); /// assert_eq!( /// s.to_line21(), /// Line21String::from_vec( @@ -421,7 +423,7 @@ impl EncodedString { /// # Examples /// ```rust /// # use lib_ccxr::util::encoding::*; - /// let s = EncodedString::from_str("résumé", Encoding::Utf8); + /// let s = EncodedString::from_str("résumé", Encoding::UTF8); /// assert_eq!( /// s.to_latin1(), /// Latin1String::from_vec( @@ -480,20 +482,20 @@ impl EncodedString { } /// Converts this [`EncodedString`] to a format provided by `encoding`, returning a new [`EncodedString`]. - /// + /// /// # Examples /// ```rust /// # use lib_ccxr::util::encoding::*; /// let v = vec![0x72, 0x5c, 0x73, 0x75, 0x6d, 0x5c]; // résumé in Line 21 encoding /// let s: EncodedString = Line21String::from_vec(v).into(); - /// assert_eq!(s.encode_to(Encoding::Utf8), "résumé".to_string().into()) + /// assert_eq!(s.encode_to(Encoding::UTF8), "résumé".to_string().into()) /// ``` pub fn encode_to(&self, encoding: Encoding) -> EncodedString { match encoding { Encoding::Line21 => EncodedString::Line21(self.to_line21()), Encoding::Latin1 => EncodedString::Latin1(self.to_latin1()), - Encoding::Ucs2 => EncodedString::Ucs2(self.to_ucs2()), - Encoding::Utf8 => EncodedString::Utf8(self.to_utf8()), + Encoding::UCS2 => EncodedString::Ucs2(self.to_ucs2()), + Encoding::UTF8 => EncodedString::Utf8(self.to_utf8()), } } @@ -653,7 +655,7 @@ fn latin1_to_line21(c: Latin1Char) -> Line21Char { } } -fn line21_to_latin1(c: Line21Char) -> Latin1Char { +pub fn line21_to_latin1(c: Line21Char) -> Latin1Char { if c < 0x80 { // Regular line-21 character set, mostly ASCII except these exceptions match c { @@ -764,118 +766,139 @@ fn line21_to_latin1(c: Line21Char) -> Latin1Char { } } -fn line21_to_utf8(c: Line21Char) -> char { +pub fn line21_to_utf8(c: Line21Char) -> Utf8Char { if c < 0x80 { // Regular line-21 character set, mostly ASCII except these exceptions match c { - 0x2a => 0xe1 as char, // lowercase a, acute accent - 0x5c => 0xe9 as char, // lowercase e, acute accent - 0x5e => 0xed as char, // lowercase i, acute accent - 0x5f => 0xf3 as char, // lowercase o, acute accent - 0x60 => 0xfa as char, // lowercase u, acute accent - 0x7b => 0xe7 as char, // lowercase c with cedilla - 0x7c => 0xf7 as char, // division symbol - 0x7d => 0xd1 as char, // uppercase N tilde - 0x7e => 0xf1 as char, // lowercase n tilde - 0x7f => '■', // Solid block - _ => c as char, + 0x2a => 0xc3a1, // lowercase a, acute accent + 0x5c => 0xc3a9, // lowercase e, acute accent + 0x5e => 0xc3ad, // lowercase i, acute accent + 0x5f => 0xc3b3, // lowercase o, acute accent + 0x60 => 0xc3ba, // lowercase u, acute accent + 0x7b => 0xc3a7, // lowercase c with cedilla + 0x7c => 0xc3b7, // division symbol + 0x7d => 0xc391, // uppercase N tilde + 0x7e => 0xc3b1, // lowercase n tilde + 0x7f => 0xe296a0, // Solid block + _ => c as u32, // Default: regular ASCII } } else { match c { // THIS BLOCK INCLUDES THE 16 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS // THAT COME FROM HI BYTE=0x11 AND LOW BETWEEN 0x30 AND 0x3F - 0x80 => 0xae as char, // Registered symbol (R) - 0x81 => 0xb0 as char, // degree sign - 0x82 => 0xbd as char, // 1/2 symbol - 0x83 => 0xbf as char, // Inverted (open) question mark - 0x84 => '™', // Trademark symbol (TM) - 0x85 => 0xa2 as char, // Cents symbol - 0x86 => 0xa3 as char, // Pounds sterling - 0x87 => 0xb6 as char, // Music note - Not in latin 1, so we use 'pilcrow' - 0x88 => 0xe0 as char, // lowercase a, grave accent - 0x89 => 0x20 as char, // transparent space, we make it regular - 0x8a => 0xe8 as char, // lowercase e, grave accent - 0x8b => 0xe2 as char, // lowercase a, circumflex accent - 0x8c => 0xea as char, // lowercase e, circumflex accent - 0x8d => 0xee as char, // lowercase i, circumflex accent - 0x8e => 0xf4 as char, // lowercase o, circumflex accent - 0x8f => 0xfb as char, // lowercase u, circumflex accent + 0x80 => 0xc2ae, // Registered symbol (R) + 0x81 => 0xc2b0, // degree sign + 0x82 => 0xc2bd, // 1/2 symbol + 0x83 => 0xc2bf, // Inverted (open) question mark + 0x84 => 0xe284a2, // Trademark symbol (TM) + 0x85 => 0xc2a2, // Cents symbol + 0x86 => 0xc2a3, // Pounds sterling + 0x87 => 0xe299aa, // Music note + 0x88 => 0xc3a0, // lowercase a, grave accent + 0x89 => 0x20, // transparent space, we make it regular + 0x8a => 0xc3a8, // lowercase e, grave accent + 0x8b => 0xc3a2, // lowercase a, circumflex accent + 0x8c => 0xc3aa, // lowercase e, circumflex accent + 0x8d => 0xc3ae, // lowercase i, circumflex accent + 0x8e => 0xc3b4, // lowercase o, circumflex accent + 0x8f => 0xc3bb, // lowercase u, circumflex accent // THIS BLOCK INCLUDES THE 32 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS // THAT COME FROM HI BYTE=0x12 AND LOW BETWEEN 0x20 AND 0x3F - 0x90 => 0xc1 as char, // capital letter A with acute - 0x91 => 0xc9 as char, // capital letter E with acute - 0x92 => 0xd3 as char, // capital letter O with acute - 0x93 => 0xda as char, // capital letter U with acute - 0x94 => 0xdc as char, // capital letter U with diaeresis - 0x95 => 0xfc as char, // lowercase letter U with diaeresis - 0x96 => 0x27 as char, // apostrophe - 0x97 => 0xa1 as char, // inverted exclamation mark - 0x98 => 0x2a as char, // asterisk - 0x99 => 0x27 as char, // apostrophe (yes, duped). See CCADI source code. - 0x9a => 0x2d as char, // em dash - 0x9b => 0xa9 as char, // copyright sign - 0x9c => '℠', // Service Mark - 0x9d => 0x2e as char, // Full stop (.) - 0x9e => 0x22 as char, // Quotation mark - 0x9f => 0x22 as char, // Quotation mark - 0xa0 => 0xc0 as char, // uppercase A, grave accent - 0xa1 => 0xc2 as char, // uppercase A, circumflex - 0xa2 => 0xc7 as char, // uppercase C with cedilla - 0xa3 => 0xc8 as char, // uppercase E, grave accent - 0xa4 => 0xca as char, // uppercase E, circumflex - 0xa5 => 0xcb as char, // capital letter E with diaeresis - 0xa6 => 0xeb as char, // lowercase letter e with diaeresis - 0xa7 => 0xce as char, // uppercase I, circumflex - 0xa8 => 0xcf as char, // uppercase I, with diaeresis - 0xa9 => 0xef as char, // lowercase i, with diaeresis - 0xaa => 0xd4 as char, // uppercase O, circumflex - 0xab => 0xd9 as char, // uppercase U, grave accent - 0xac => 0xf9 as char, // lowercase u, grave accent - 0xad => 0xdb as char, // uppercase U, circumflex - 0xae => 0xab as char, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - 0xaf => 0xbb as char, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + 0x90 => 0xc381, // capital letter A with acute + 0x91 => 0xc389, // capital letter E with acute + 0x92 => 0xc393, // capital letter O with acute + 0x93 => 0xc39a, // capital letter U with acute + 0x94 => 0xc39c, // capital letter U with diaeresis + 0x95 => 0xc3bc, // lowercase letter U with diaeresis + 0x96 => 0x27, // apostrophe + 0x97 => 0xc2a1, // inverted exclamation mark + 0x98 => 0x2a, // asterisk + 0x99 => 0x27, // Plain single quote + 0x9a => 0xe28094, // em dash + 0x9b => 0xc2a9, // copyright sign + 0x9c => 0xe284a0, // Service mark + 0x9d => 0xe280a2, // Round bullet + 0x9e => 0xe2809c, // Opening double quotes + 0x9f => 0xe2809d, // Closing double quotes + 0xa0 => 0xc380, // uppercase A, grave accent + 0xa1 => 0xc382, // uppercase A, circumflex + 0xa2 => 0xc387, // uppercase C with cedilla + 0xa3 => 0xc388, // uppercase E, grave accent + 0xa4 => 0xc38a, // uppercase E, circumflex + 0xa5 => 0xc38b, // capital letter E with diaeresis + 0xa6 => 0xc3ab, // lowercase letter e with diaeresis + 0xa7 => 0xc38e, // uppercase I, circumflex + 0xa8 => 0xc38f, // uppercase I, with diaeresis + 0xa9 => 0xc3af, // lowercase i, with diaeresis + 0xaa => 0xc394, // uppercase O, circumflex + 0xab => 0xc399, // uppercase U, grave accent + 0xac => 0xc3b9, // lowercase u, grave accent + 0xad => 0xc39b, // uppercase U, circumflex + 0xae => 0xc2ab, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + 0xaf => 0xc2bb, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + // THIS BLOCK INCLUDES THE 32 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS // THAT COME FROM HI BYTE=0x13 AND LOW BETWEEN 0x20 AND 0x3F - 0xb0 => 0xc3 as char, // Uppercase A, tilde - 0xb1 => 0xe3 as char, // Lowercase a, tilde - 0xb2 => 0xcd as char, // Uppercase I, acute accent - 0xb3 => 0xcc as char, // Uppercase I, grave accent - 0xb4 => 0xec as char, // Lowercase i, grave accent - 0xb5 => 0xd2 as char, // Uppercase O, grave accent - 0xb6 => 0xf2 as char, // Lowercase o, grave accent - 0xb7 => 0xd5 as char, // Uppercase O, tilde - 0xb8 => 0xf5 as char, // Lowercase o, tilde - 0xb9 => 0x7b as char, // Open curly brace - 0xba => 0x7d as char, // Closing curly brace - 0xbb => 0x5c as char, // Backslash - 0xbc => 0x5e as char, // Caret - 0xbd => 0x5f as char, // Underscore - 0xbe => 0xa6 as char, // Pipe (broken bar) - 0xbf => 0x7e as char, // Tilde - 0xc0 => 0xc4 as char, // Uppercase A, umlaut - 0xc1 => 0xe3 as char, // Lowercase A, umlaut - 0xc2 => 0xd6 as char, // Uppercase O, umlaut - 0xc3 => 0xf6 as char, // Lowercase o, umlaut - 0xc4 => 0xdf as char, // Eszett (sharp S) - 0xc5 => 0xa5 as char, // Yen symbol - 0xc6 => 0xa4 as char, // Currency symbol - 0xc7 => 0x7c as char, // Vertical bar - 0xc8 => 0xc5 as char, // Uppercase A, ring - 0xc9 => 0xe5 as char, // Lowercase A, ring - 0xca => 0xd8 as char, // Uppercase O, slash - 0xcb => 0xf8 as char, // Lowercase o, slash - 0xcc => '⌜', // Top left corner - 0xcd => '⌝', // Top right corner - 0xce => '⌞', // Bottom left corner - 0xcf => '⌟', // Bottom right corner - _ => UNAVAILABLE_CHAR as char, // For those that don't have representation - // I'll do it eventually, I promise - // This are weird chars anyway + 0xb0 => 0xc383, // Uppercase A, tilde + 0xb1 => 0xc3a3, // Lowercase a, tilde + 0xb2 => 0xc38d, // Uppercase I, acute accent + 0xb3 => 0xc38c, // Uppercase I, grave accent + 0xb4 => 0xc3ac, // Lowercase i, grave accent + 0xb5 => 0xc392, // Uppercase O, grave accent + 0xb6 => 0xc3b2, // Lowercase o, grave accent + 0xb7 => 0xc395, // Uppercase O, tilde + 0xb8 => 0xc3b5, // Lowercase o, tilde + 0xb9 => 0x7b, // Open curly brace + 0xba => 0x7d, // Closing curly brace + 0xbb => 0x5c, // Backslash + 0xbc => 0x5e, // Caret + 0xbd => 0x5f, // Underscore + 0xbe => 0xc2a6, // Pipe (broken bar) + 0xbf => 0x7e, // Tilde + 0xc0 => 0xc384, // Uppercase A, umlaut + 0xc1 => 0xc3a4, // Lowercase A, umlaut + 0xc2 => 0xc396, // Uppercase O, umlaut + 0xc3 => 0xc3b6, // Lowercase o, umlaut + 0xc4 => 0xc39f, // Esszett (sharp S) + 0xc5 => 0xc2a5, // Yen symbol + 0xc6 => 0xc2a4, // Currency symbol + 0xc7 => 0x7c, // Vertical bar + 0xc8 => 0xc385, // Uppercase A, ring + 0xc9 => 0xc3a5, // Lowercase A, ring + 0xca => 0xc398, // Uppercase O, slash + 0xcb => 0xc3b8, // Lowercase o, slash + 0xcc => 0xe28c9c, // Top left corner + 0xcd => 0xe28c9d, // Top right corner + 0xce => 0xe28c9e, // Bottom left corner + 0xcf => 0xe28c9f, // Bottom right corner + _ => b'?' as u32, // I'll do it eventually, I promise + // This are weird chars anyway } } } +pub fn line21_to_char(c: Line21Char) -> char { + let utf8_packed = line21_to_utf8(c); + + let bytes = if utf8_packed <= 0xff { + vec![utf8_packed as u8] + } else if utf8_packed <= 0xffff { + vec![(utf8_packed >> 8) as u8, utf8_packed as u8] + } else if utf8_packed <= 0xffffff { + vec![ + (utf8_packed >> 16) as u8, + (utf8_packed >> 8) as u8, + utf8_packed as u8, + ] + } else { + return '?'; + }; -fn line21_to_ucs2(c: Line21Char) -> Ucs2Char { + std::str::from_utf8(&bytes) + .ok() + .and_then(|s| s.chars().next()) + .unwrap_or('?') +} +pub fn line21_to_ucs2(c: Line21Char) -> Ucs2Char { match c { 0x7f => 0x25A0, // Solid block 0x84 => 0x2122, // Trademark symbol (TM) @@ -905,7 +928,7 @@ fn ucs2_to_line21(c: Ucs2Char) -> Line21Char { } } -fn ucs2_to_latin1(c: Ucs2Char) -> Latin1Char { +pub fn ucs2_to_latin1(c: Ucs2Char) -> Latin1Char { // Code points 0 to U+00FF are the same in both. if c < 0xff { c as u8 @@ -1004,6 +1027,6 @@ fn ucs2_to_char(c: Ucs2Char) -> char { char::from_u32(x).unwrap_or(UNAVAILABLE_CHAR.into()) } -fn char_to_ucs2(c: char) -> Ucs2Char { +pub fn char_to_ucs2(c: char) -> Ucs2Char { (c as u32).try_into().unwrap_or(UNAVAILABLE_CHAR.into()) } diff --git a/src/rust/src/common.rs b/src/rust/src/common.rs index b182c5020..4de6dd7e0 100644 --- a/src/rust/src/common.rs +++ b/src/rust/src/common.rs @@ -352,8 +352,8 @@ impl CType for Encoding { match self { Encoding::Line21 => ccx_encoding_type_CCX_ENC_ASCII as _, Encoding::Latin1 => ccx_encoding_type_CCX_ENC_LATIN_1 as _, - Encoding::Utf8 => ccx_encoding_type_CCX_ENC_UTF_8 as _, - Encoding::Ucs2 => ccx_encoding_type_CCX_ENC_UNICODE as _, + Encoding::UTF8 => ccx_encoding_type_CCX_ENC_UTF_8 as _, + Encoding::UCS2 => ccx_encoding_type_CCX_ENC_UNICODE as _, } } } diff --git a/src/rust/src/encoder/common.rs b/src/rust/src/encoder/common.rs new file mode 100644 index 000000000..91d2b163e --- /dev/null +++ b/src/rust/src/encoder/common.rs @@ -0,0 +1,420 @@ +#![allow(dead_code)] +use crate::bindings::{ + ccx_encoding_type_CCX_ENC_UNICODE, ccx_s_write, encoder_ctx, net_send_header, + write_spumux_footer, write_spumux_header, +}; +use crate::ccx_options; +use crate::encoder::FromCType; +use lib_ccxr::common::{OutputFormat, BROADCAST_HEADER, LITTLE_ENDIAN_BOM, UTF8_BOM}; +use lib_ccxr::util::encoding::Encoding; +use lib_ccxr::util::log::DebugMessageFlag; +use lib_ccxr::{debug, info}; +use std::alloc::{alloc, dealloc, Layout}; +use std::fs::File; +use std::io::Write; +#[cfg(unix)] +use std::os::fd::FromRawFd; +use std::os::raw::{c_int, c_uchar, c_uint, c_void}; +#[cfg(windows)] +use std::os::windows::io::FromRawHandle; +use std::ptr; + +const CCD_HEADER: &[u8] = b"SCC_disassembly V1.2"; +const SCC_HEADER: &[u8] = b"Scenarist_SCC V1.0"; + +const SSA_HEADER: &str = "[Script Info]\n\ +Title: Default file\n\ +ScriptType: v4.00+\n\ +\n\ +[V4+ Styles]\n\ +Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n\ +Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,0\n\ +\n\ +[Events]\n\ +Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\ +\n"; +const SAMI_HEADER: &str = "\n\ +\n\ +\n\ +\n\n\ +\n"; +const SMPTETT_HEADER: &str = "\n \n \n \n