@@ -85,14 +85,41 @@ impl<'a> Writer<'a> {
8585
8686/// Write the symbol to the provided buffer
8787///
88- /// Always writes 2 bytes for consistent UTF-16BE encoding.
89- /// Previously, this function wrote 1 byte for ASCII characters and 2 bytes
90- /// for non-ASCII, creating an invalid mix that encoding conversion couldn't
91- /// handle properly. This caused garbled output with Japanese/Chinese characters
92- /// (issue #1451).
93- pub fn write_char ( sym : & dtvcc_symbol , buf : & mut Vec < u8 > ) {
94- buf. push ( ( sym. sym >> 8 ) as u8 ) ;
95- buf. push ( ( sym. sym & 0xff ) as u8 ) ;
88+ /// The `use_utf16` parameter controls the output format:
89+ /// - `true`: Always writes 2 bytes (UTF-16BE format). Use for UTF-16/UCS-2 charsets.
90+ /// - `false`: Writes 1 byte for ASCII (high byte == 0), 2 bytes for extended chars.
91+ /// Use for variable-width encodings like EUC-KR, CP949, Shift-JIS, etc.
92+ ///
93+ /// Issue #1451: Japanese/Chinese with UTF-16BE need 2 bytes for all characters.
94+ /// Issue #1065: Korean with EUC-KR needs 1 byte for ASCII, 2 bytes for Korean.
95+ pub fn write_char ( sym : & dtvcc_symbol , buf : & mut Vec < u8 > , use_utf16 : bool ) {
96+ let high = ( sym. sym >> 8 ) as u8 ;
97+ let low = ( sym. sym & 0xff ) as u8 ;
98+
99+ if use_utf16 {
100+ // UTF-16BE: Always write 2 bytes
101+ buf. push ( high) ;
102+ buf. push ( low) ;
103+ } else {
104+ // Variable-width: Only write high byte if non-zero
105+ if high != 0 {
106+ buf. push ( high) ;
107+ }
108+ buf. push ( low) ;
109+ }
110+ }
111+
/// Check if a charset name indicates UTF-16 or UCS-2 encoding
///
/// Returns `true` when `charset` contains, ignoring ASCII case, any of the
/// spellings `UTF-16`, `UTF16`, `UTF_16`, `UCS-2`, `UCS2` or `UCS_2`.
/// These encodings use 16-bit code units, so even ASCII needs 2 bytes.
///
/// Only ASCII letters are case-folded. Full Unicode upper-casing would map
/// unrelated characters onto ASCII (for example U+017F "ſ" becomes "S") and
/// make a non-ASCII label match by accident.
///
/// NOTE(review): little-endian labels such as `UTF-16LE` also match; confirm
/// that callers account for byte order.
pub fn is_utf16_charset(charset: &str) -> bool {
    // Accepted spellings, upper-cased, with and without a separator.
    const MARKERS: [&str; 6] = ["UTF-16", "UTF16", "UTF_16", "UCS-2", "UCS2", "UCS_2"];

    let upper = charset.to_ascii_uppercase();
    MARKERS.iter().any(|marker| upper.contains(*marker))
}
97124
98125/// Convert from CEA-708 color representation to hex code
@@ -114,27 +141,71 @@ mod tests {
114141 use super :: * ;
115142
#[test]
fn test_write_char_utf16_mode() {
    // Encode one symbol in UTF-16 mode into a fresh buffer.
    let encode = |code| {
        let mut out = Vec::new();
        write_char(&dtvcc_symbol { sym: code, init: 0 }, &mut out, true);
        out
    };

    // UTF-16 mode: ASCII symbol 'A' (0x41) becomes [0x00, 0x41]
    assert_eq!(encode(0x41), vec![0x00, 0x41]);

    // UTF-16 mode: Non-ASCII symbol writes as [high_byte, low_byte]
    assert_eq!(encode(0x1234), vec![0x12, 0x34]);
}
137162
#[test]
fn test_write_char_variable_width_mode() {
    // Encode one symbol in variable-width mode into a fresh buffer.
    let encode = |code| {
        let mut out = Vec::new();
        write_char(&dtvcc_symbol { sym: code, init: 0 }, &mut out, false);
        out
    };

    // ASCII 'A' (0x41) is a single byte.
    assert_eq!(encode(0x41), vec![0x41]);

    // A two-byte EUC-KR value (0xC0CE, Korean '인') keeps both bytes, high first.
    assert_eq!(encode(0xC0CE), vec![0xC0, 0xCE]);

    // Space (0x20) is a single byte with no leading NUL.
    assert_eq!(encode(0x20), vec![0x20]);
}
190+
#[test]
fn test_is_utf16_charset() {
    // Labels that must be recognised as UTF-16 / UCS-2 variants.
    let sixteen_bit = ["UTF-16BE", "UTF-16LE", "utf-16", "UTF16", "UCS-2", "UCS2"];
    for name in sixteen_bit.iter() {
        assert!(is_utf16_charset(name), "{} should be detected", name);
    }

    // Variable-width or single-byte encodings that must be rejected.
    let other = ["EUC-KR", "CP949", "Shift-JIS", "UTF-8", "ISO-8859-1"];
    for name in other.iter() {
        assert!(!is_utf16_charset(name), "{} should not be detected", name);
    }
}
208+
138209 #[ test]
139210 fn test_color_to_hex ( ) {
140211 assert_eq ! ( color_to_hex( 0b00_00_00 ) , ( 0 , 0 , 0 ) ) ; // Black
0 commit comments