|
15 | 15 | import java.nio.charset.StandardCharsets; |
16 | 16 |
|
17 | 17 | /** |
18 | | - * @author kapodamy |
| 18 | + * Converts TTML subtitles to SRT format. |
| 19 | + * |
| 20 | + * References: |
| 21 | + * - TTML 2.0 (W3C): https://www.w3.org/TR/ttml2/ |
| 22 | + * - SRT format: https://en.wikipedia.org/wiki/SubRip |
19 | 23 | */ |
20 | 24 | public class SrtFromTtmlWriter { |
21 | 25 | private static final String NEW_LINE = "\r\n"; |
@@ -135,20 +139,37 @@ private String normalizeLineBreakForSrt(final String text) { |
135 | 139 | private String normalizeForSrt(final String actualText) { |
136 | 140 | String cleaned = actualText; |
137 | 141 |
|
138 | | - // Replace non-breaking space (\u00A0) with regular space ' '(\u0020). |
| 142 | + // Replace NBSP, the "non-breaking space" (\u00A0), with a regular space ' ' (\u0020).
| 143 | + // |
| 144 | + // Why: |
| 145 | + // - Some viewers render NBSP(\u00A0) incorrectly: |
| 146 | + // * MPlayer 1.5: shown as “??” |
| 147 | + // * Linux command `cat -A`: displayed as control-like markers |
| 148 | + // (M-BM-) |
| 149 | + // * Acode (Android editor): displayed as visible replacement |
| 150 | + // glyphs (red dots) |
| 151 | + // - Other viewers show it as a normal space (e.g., VS Code 1.104.0, |
| 152 | + // vlc 3.0.20, mpv 0.37.0, Totem 43.0) |
| 153 | + // → Mixed rendering creates inconsistency and may confuse users. |
| 154 | + // |
| 155 | + // Details: |
139 | 156 | // - YouTube TTML subtitles use both regular spaces (\u0020) |
140 | 157 | // and non-breaking spaces (\u00A0). |
141 | 158 | // - SRT subtitles only support regular spaces (\u0020), |
142 | 159 | // so \u00A0 may cause display issues. |
143 | 160 | // - \u00A0 and \u0020 are visually identical (i.e., they both |
144 | 161 | // appear as spaces ' '), but they differ in Unicode encoding, |
145 | | - // leading to test failures (e.g., ComparisonFailure). |
146 | | - // - Convert \u00A0 to \u0020 to ensure consistency in subtitle |
147 | | - // formatting. |
148 | | - // - References: |
149 | | - // - Unicode General Punctuation: https://unicode.org/charts/PDF/U2000.pdf |
150 | | - // - TTML Spec: https://www.w3.org/TR/ttml2/ |
151 | | - // - SRT Format: https://en.wikipedia.org/wiki/SubRip |
| 162 | + // and NBSP (\u00A0) renders differently in different viewers. |
| 163 | + // - SRT is a plain-text format and does not interpret |
| 164 | + // "non-breaking" behavior. |
| 165 | + // |
| 166 | + // Conclusion: |
| 167 | + // - To ensure uniform behavior, replace it with a regular space, |
| 168 | + // which carries no "non-breaking" behavior. |
| 169 | + // |
| 170 | + // References: |
| 171 | + // - Unicode U+00A0 NBSP (Latin-1 Supplement): |
| 172 | + // https://unicode.org/charts/PDF/U0080.pdf |
152 | 173 | cleaned = cleaned.replace('\u00A0', ' ') // Non-breaking space |
153 | 174 | .replace('\u202F', ' ') // Narrow no-break space |
154 | 175 | .replace('\u205F', ' ') // Medium mathematical space |
|
0 commit comments