Skip to content

Commit da3dc52

Browse files
cfsmp3 and claude committed
fix(708): Support Korean EUC-KR encoding in CEA-708 decoder
Korean broadcasts use EUC-KR encoding (variable-width) in CEA-708 captions, where ASCII is 1 byte and Korean characters are 2 bytes. The decoder was always writing 2 bytes per character (UTF-16BE style), causing NULL bytes to be inserted before every ASCII character.

Changes:
- Add is_utf16_charset() to detect fixed-width 16-bit encodings
- Modify write_char() to accept use_utf16 flag:
  - true: Always 2 bytes (UTF-16BE for Japanese, issue #1451)
  - false: 1 byte for ASCII, 2 bytes for extended (EUC-KR for Korean)
- Detect charset type in write_row() before building output buffer

This fixes Korean subtitle extraction when using --service "1[EUC-KR]" while maintaining compatibility with Japanese UTF-16BE (issue #1451).

Closes #1065

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 0fdfb75 commit da3dc52

File tree

2 files changed

+102
-17
lines changed

2 files changed

+102
-17
lines changed

src/rust/src/decoder/output.rs

Lines changed: 83 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,38 @@ impl<'a> Writer<'a> {
8585

8686
/// Write the symbol to the provided buffer
8787
///
88-
/// Always writes 2 bytes for consistent UTF-16BE encoding.
89-
/// Previously, this function wrote 1 byte for ASCII characters and 2 bytes
90-
/// for non-ASCII, creating an invalid mix that encoding conversion couldn't
91-
/// handle properly. This caused garbled output with Japanese/Chinese characters
92-
/// (issue #1451).
93-
pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec<u8>) {
94-
buf.push((sym.sym >> 8) as u8);
95-
buf.push((sym.sym & 0xff) as u8);
88+
/// The `use_utf16` parameter controls the output format:
89+
/// - `true`: Always writes 2 bytes (UTF-16BE format). Use for UTF-16/UCS-2 charsets.
90+
/// - `false`: Writes 1 byte for ASCII (high byte == 0), 2 bytes for extended chars.
91+
/// Use for variable-width encodings like EUC-KR, CP949, Shift-JIS, etc.
92+
///
93+
/// Issue #1451: Japanese/Chinese with UTF-16BE need 2 bytes for all characters.
94+
/// Issue #1065: Korean with EUC-KR needs 1 byte for ASCII, 2 bytes for Korean.
95+
pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec<u8>, use_utf16: bool) {
96+
let high = (sym.sym >> 8) as u8;
97+
let low = (sym.sym & 0xff) as u8;
98+
99+
if use_utf16 {
100+
// UTF-16BE: Always write 2 bytes
101+
buf.push(high);
102+
buf.push(low);
103+
} else {
104+
// Variable-width: Only write high byte if non-zero
105+
if high != 0 {
106+
buf.push(high);
107+
}
108+
buf.push(low);
109+
}
110+
}
111+
112+
/// Check if a charset name indicates UTF-16 or UCS-2 encoding
113+
///
114+
/// These are fixed-width 16-bit encodings where even ASCII needs 2 bytes.
115+
pub fn is_utf16_charset(charset: &str) -> bool {
    // Accepted spellings of the fixed-width 16-bit encodings: hyphenated,
    // bare and underscored forms of both UTF-16 and UCS-2.
    const UTF16_MARKERS: [&str; 6] = ["UTF-16", "UTF16", "UTF_16", "UCS-2", "UCS2", "UCS_2"];

    // Upper-case once so the comparison is case-insensitive; a substring
    // match lets suffixed names such as "UTF-16BE" / "UTF-16LE" qualify too.
    let upper = charset.to_uppercase();
    UTF16_MARKERS.iter().any(|&marker| upper.contains(marker))
}
97121

98122
/// Convert from CEA-708 color representation to hex code
@@ -114,27 +138,71 @@ mod tests {
114138
use super::*;
115139

116140
#[test]
117-
fn test_write_char() {
141+
fn test_write_char_utf16_mode() {
118142
let mut buf = Vec::new();
119143

120-
// Write ASCII symbol - UTF-16BE always uses 2 bytes
121-
// 'A' (0x41) becomes [0x00, 0x41] in UTF-16BE
144+
// UTF-16 mode: ASCII symbol 'A' (0x41) becomes [0x00, 0x41]
122145
let sym = dtvcc_symbol { sym: 0x41, init: 0 };
123-
write_char(&sym, &mut buf);
146+
write_char(&sym, &mut buf, true);
124147
assert_eq!(buf, vec![0x00, 0x41]);
125148

126149
buf.clear();
127150

128-
// Write non-ASCII symbol (e.g., Japanese character)
129-
// Already 16-bit, writes as [high_byte, low_byte]
151+
// UTF-16 mode: Non-ASCII symbol writes as [high_byte, low_byte]
130152
let sym = dtvcc_symbol {
131153
sym: 0x1234,
132154
init: 0,
133155
};
134-
write_char(&sym, &mut buf);
156+
write_char(&sym, &mut buf, true);
135157
assert_eq!(buf, vec![0x12, 0x34]);
136158
}
137159

160+
#[test]
fn test_write_char_variable_width_mode() {
    // Encode one symbol in variable-width mode into a fresh buffer.
    let encode = |sym: dtvcc_symbol| {
        let mut buf = Vec::new();
        write_char(&sym, &mut buf, false);
        buf
    };

    // ASCII symbol 'A' (0x41) becomes [0x41] (1 byte)
    assert_eq!(encode(dtvcc_symbol { sym: 0x41, init: 0 }), vec![0x41]);

    // Korean EUC-KR char becomes [high, low] (2 bytes)
    // Example: Korean '인' = 0xC0CE in EUC-KR
    assert_eq!(
        encode(dtvcc_symbol {
            sym: 0xC0CE,
            init: 0,
        }),
        vec![0xC0, 0xCE]
    );

    // Space (0x20) becomes [0x20] (1 byte, no NUL)
    assert_eq!(encode(dtvcc_symbol { sym: 0x20, init: 0 }), vec![0x20]);
}
187+
188+
#[test]
fn test_is_utf16_charset() {
    // UTF-16 / UCS-2 variants must be detected, regardless of case or suffix.
    for name in ["UTF-16BE", "UTF-16LE", "utf-16", "UTF16", "UCS-2", "UCS2"] {
        assert!(
            is_utf16_charset(name),
            "{name} should be detected as UTF-16/UCS-2"
        );
    }

    // Variable-width and single-byte encodings must not be detected.
    for name in ["EUC-KR", "CP949", "Shift-JIS", "UTF-8", "ISO-8859-1"] {
        assert!(
            !is_utf16_charset(name),
            "{name} should not be detected as UTF-16/UCS-2"
        );
    }
}
205+
138206
#[test]
139207
fn test_color_to_hex() {
140208
assert_eq!(color_to_hex(0b00_00_00), (0, 0, 0)); // Black

src/rust/src/decoder/tv_screen.rs

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use std::{ffi::CStr, fs::File};
1313
#[cfg(windows)]
1414
use crate::bindings::_get_osfhandle;
1515

16-
use super::output::{color_to_hex, write_char, Writer};
16+
use super::output::{color_to_hex, is_utf16_charset, write_char, Writer};
1717
use super::timing::{get_scc_time_str, get_time_str};
1818
use super::{CCX_DTVCC_SCREENGRID_COLUMNS, CCX_DTVCC_SCREENGRID_ROWS};
1919
use crate::{
@@ -177,6 +177,23 @@ impl dtvcc_tv_screen {
177177
let (first, last) = self.get_write_interval(row_index);
178178
debug!("First: {first}, Last: {last}");
179179

180+
// Determine if we should use UTF-16 mode (2 bytes for all chars) or
181+
// variable-width mode (1 byte for ASCII, 2 bytes for extended chars).
182+
// UTF-16/UCS-2 encodings require 2 bytes even for ASCII.
183+
// Variable-width encodings (EUC-KR, CP949, Shift-JIS, etc.) use 1 byte for ASCII.
184+
let use_utf16 = if !writer.writer_ctx.charset.is_null() {
185+
let charset = unsafe {
186+
CStr::from_ptr(writer.writer_ctx.charset)
187+
.to_str()
188+
.unwrap_or("")
189+
};
190+
is_utf16_charset(charset)
191+
} else {
192+
// No charset specified - default to variable-width for backward compatibility
193+
// with raw byte output (no encoding conversion)
194+
false
195+
};
196+
180197
for i in 0..last + 1 {
181198
if use_colors {
182199
self.change_pen_color(
@@ -219,7 +236,7 @@ impl dtvcc_tv_screen {
219236
if i < first {
220237
buf.push(b' ');
221238
} else {
222-
write_char(&self.chars[row_index][i], &mut buf)
239+
write_char(&self.chars[row_index][i], &mut buf, use_utf16)
223240
}
224241
}
225242
// there can be unclosed tags or colors after the last symbol in a row

0 commit comments

Comments
 (0)