fix(708): Support Korean EUC-KR encoding in CEA-708 decoder

cfsmp3 · claude · cfsmp3 · commit da3dc52b45de · 2025-12-21T09:43:27.000+01:00
Korean broadcasts use EUC-KR encoding (variable-width) in CEA-708 captions, where ASCII is 1 byte and Korean characters are 2 bytes. The decoder was always writing 2 bytes per character (UTF-16BE style), causing NUL bytes to be inserted before every ASCII character.

Changes:
- Add is_utf16_charset() to detect fixed-width 16-bit encodings
- Modify write_char() to accept a use_utf16 flag:
  - true: Always 2 bytes (UTF-16BE for Japanese, issue #1451)
  - false: 1 byte for ASCII, 2 bytes for extended (EUC-KR for Korean)
- Detect charset type in write_row() before building output buffer

This fixes Korean subtitle extraction when using --service "1[EUC-KR]" while maintaining compatibility with Japanese UTF-16BE (issue #1451).

Closes #1065

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/src/rust/src/decoder/output.rs b/src/rust/src/decoder/output.rs
@@ -85,14 +85,38 @@ impl<'a> Writer<'a> {
 
 /// Write the symbol to the provided buffer
 ///
-/// Always writes 2 bytes for consistent UTF-16BE encoding.
-/// Previously, this function wrote 1 byte for ASCII characters and 2 bytes
-/// for non-ASCII, creating an invalid mix that encoding conversion couldn't
-/// handle properly. This caused garbled output with Japanese/Chinese characters
-/// (issue #1451).
-pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec<u8>) {
-    buf.push((sym.sym >> 8) as u8);
-    buf.push((sym.sym & 0xff) as u8);
+/// The `use_utf16` parameter controls the output format:
+/// - `true`: Always writes 2 bytes (UTF-16BE format). Use for UTF-16/UCS-2 charsets.
+/// - `false`: Writes 1 byte for ASCII (high byte == 0), 2 bytes for extended chars.
+///   Use for variable-width encodings like EUC-KR, CP949, Shift-JIS, etc.
+///
+/// Issue #1451: Japanese/Chinese with UTF-16BE need 2 bytes for all characters.
+/// Issue #1065: Korean with EUC-KR needs 1 byte for ASCII, 2 bytes for Korean.
+pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec<u8>, use_utf16: bool) {
+    let high = (sym.sym >> 8) as u8;
+    let low = (sym.sym & 0xff) as u8;
+
+    if use_utf16 {
+        // UTF-16BE: Always write 2 bytes
+        buf.push(high);
+        buf.push(low);
+    } else {
+        // Variable-width: Only write high byte if non-zero
+        if high != 0 {
+            buf.push(high);
+        }
+        buf.push(low);
+    }
+}
+
+/// Check if a charset name indicates UTF-16 or UCS-2 encoding
+///
+/// These are fixed-width 16-bit encodings where even ASCII needs 2 bytes.
+pub fn is_utf16_charset(charset: &str) -> bool {
+    let upper = charset.to_uppercase();
+    upper.contains("UTF-16") || upper.contains("UTF16") ||
+    upper.contains("UCS-2") || upper.contains("UCS2") ||
+    upper.contains("UTF_16") || upper.contains("UCS_2")
 }
 
 /// Convert from CEA-708 color representation to hex code
@@ -114,27 +138,71 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_write_char() {
+    fn test_write_char_utf16_mode() {
         let mut buf = Vec::new();
 
-        // Write ASCII symbol - UTF-16BE always uses 2 bytes
-        // 'A' (0x41) becomes [0x00, 0x41] in UTF-16BE
+        // UTF-16 mode: ASCII symbol 'A' (0x41) becomes [0x00, 0x41]
         let sym = dtvcc_symbol { sym: 0x41, init: 0 };
-        write_char(&sym, &mut buf);
+        write_char(&sym, &mut buf, true);
         assert_eq!(buf, vec![0x00, 0x41]);
 
         buf.clear();
 
-        // Write non-ASCII symbol (e.g., Japanese character)
-        // Already 16-bit, writes as [high_byte, low_byte]
+        // UTF-16 mode: Non-ASCII symbol writes as [high_byte, low_byte]
         let sym = dtvcc_symbol {
             sym: 0x1234,
             init: 0,
         };
-        write_char(&sym, &mut buf);
+        write_char(&sym, &mut buf, true);
         assert_eq!(buf, vec![0x12, 0x34]);
     }
 
+    #[test]
+    fn test_write_char_variable_width_mode() {
+        let mut buf = Vec::new();
+
+        // Variable-width mode: ASCII symbol 'A' (0x41) becomes [0x41] (1 byte)
+        let sym = dtvcc_symbol { sym: 0x41, init: 0 };
+        write_char(&sym, &mut buf, false);
+        assert_eq!(buf, vec![0x41]);
+
+        buf.clear();
+
+        // Variable-width mode: Korean EUC-KR char becomes [high, low] (2 bytes)
+        // Example: Korean '인' = 0xC0CE in EUC-KR
+        let sym = dtvcc_symbol {
+            sym: 0xC0CE,
+            init: 0,
+        };
+        write_char(&sym, &mut buf, false);
+        assert_eq!(buf, vec![0xC0, 0xCE]);
+
+        buf.clear();
+
+        // Variable-width mode: Space (0x20) becomes [0x20] (1 byte, no NUL)
+        let sym = dtvcc_symbol { sym: 0x20, init: 0 };
+        write_char(&sym, &mut buf, false);
+        assert_eq!(buf, vec![0x20]);
+    }
+
+    #[test]
+    fn test_is_utf16_charset() {
+        // Should return true for UTF-16 variants
+        assert!(is_utf16_charset("UTF-16BE"));
+        assert!(is_utf16_charset("UTF-16LE"));
+        assert!(is_utf16_charset("utf-16"));
+        assert!(is_utf16_charset("UTF16"));
+        assert!(is_utf16_charset("UCS-2"));
+        assert!(is_utf16_charset("UCS2"));
+
+        // Should return false for variable-width encodings
+        assert!(!is_utf16_charset("EUC-KR"));
+        assert!(!is_utf16_charset("CP949"));
+        assert!(!is_utf16_charset("Shift-JIS"));
+        assert!(!is_utf16_charset("UTF-8"));
+        assert!(!is_utf16_charset("ISO-8859-1"));
+    }
+
     #[test]
     fn test_color_to_hex() {
         assert_eq!(color_to_hex(0b00_00_00), (0, 0, 0)); // Black
diff --git a/src/rust/src/decoder/tv_screen.rs b/src/rust/src/decoder/tv_screen.rs
@@ -13,7 +13,7 @@ use std::{ffi::CStr, fs::File};
 #[cfg(windows)]
 use crate::bindings::_get_osfhandle;
 
-use super::output::{color_to_hex, write_char, Writer};
+use super::output::{color_to_hex, is_utf16_charset, write_char, Writer};
 use super::timing::{get_scc_time_str, get_time_str};
 use super::{CCX_DTVCC_SCREENGRID_COLUMNS, CCX_DTVCC_SCREENGRID_ROWS};
 use crate::{
@@ -177,6 +177,23 @@ impl dtvcc_tv_screen {
         let (first, last) = self.get_write_interval(row_index);
         debug!("First: {first}, Last: {last}");
 
+        // Determine if we should use UTF-16 mode (2 bytes for all chars) or
+        // variable-width mode (1 byte for ASCII, 2 bytes for extended chars).
+        // UTF-16/UCS-2 encodings require 2 bytes even for ASCII.
+        // Variable-width encodings (EUC-KR, CP949, Shift-JIS, etc.) use 1 byte for ASCII.
+        let use_utf16 = if !writer.writer_ctx.charset.is_null() {
+            let charset = unsafe {
+                CStr::from_ptr(writer.writer_ctx.charset)
+                    .to_str()
+                    .unwrap_or("")
+            };
+            is_utf16_charset(charset)
+        } else {
+            // No charset specified - default to variable-width for backward compatibility
+            // with raw byte output (no encoding conversion)
+            false
+        };
+
         for i in 0..last + 1 {
             if use_colors {
                 self.change_pen_color(
@@ -219,7 +236,7 @@ impl dtvcc_tv_screen {
             if i < first {
                 buf.push(b' ');
             } else {
-                write_char(&self.chars[row_index][i], &mut buf)
+                write_char(&self.chars[row_index][i], &mut buf, use_utf16)
             }
         }
         // there can be unclosed tags or colors after the last symbol in a row