1515import java .nio .charset .StandardCharsets ;
1616
1717/**
18- * @author kapodamy
18+ * Converts TTML subtitles to SRT format.
19+ *
20+ * References:
21+ * - TTML 2.0 (W3C): https://www.w3.org/TR/ttml2/
22+ * - SRT format: https://en.wikipedia.org/wiki/SubRip
1923 */
2024public class SrtFromTtmlWriter {
2125 private static final String NEW_LINE = "\r \n " ;
@@ -59,6 +63,226 @@ private void writeString(final String text) throws IOException {
5963 out .write (text .getBytes (charset ));
6064 }
6165
66+ /**
67+ * Decode XML or HTML entities into their actual (literal) characters.
68+ *
69+ * TTML is XML-based, so text nodes may contain escaped entities
70+ * instead of direct characters. For example:
71+ *
72+ * "&" → "&"
73+ * "<" → "<"
74+ * ">" → ">"
75+ * "	" → "\t" (TAB)
76+ * "
" ( ) → "\n" (LINE FEED)
77+ *
78+ * XML files cannot contain characters like "<", ">", "&" directly,
79+ * so they must be represented using their entity-encoded forms.
80+ *
81+ * Jsoup sometimes leaves nested or encoded entities unresolved
82+ * (e.g. inside <p> text nodes in TTML files), so this function
83+ * acts as a final “safety net” to ensure all entities are decoded
84+ * before further normalization.
85+ *
86+ * Character representation layers for reference:
87+ * - Literal characters: <, >, &
88+ * → appear in runtime/output text (e.g. final SRT output)
89+ * - Escaped entities: <, >, &
90+ * → appear in XML/HTML/TTML source files
91+ * - Numeric entities:  , 	, 
92+ * → appear mainly in XML/TTML files (also valid in HTML)
93+ * for non-printable or special characters
94+ * - Unicode escapes: \u00A0 (Java/Unicode internal form)
95+ * → appear only in Java source code (NOT valid in XML)
96+ *
97+ * XML entities include both named (&, <) and numeric
98+ * ( ,  ) forms.
99+ *
100+ * @param encodedEntities The raw text fragment possibly containing
101+ * encoded XML entities.
102+ * @return A decoded string where all entities are replaced by their
103+ * actual (literal) characters.
104+ */
105+ private String decodeXmlEntities (final String encodedEntities ) {
106+ return Parser .unescapeEntities (encodedEntities , true );
107+ }
108+
109+ /**
110+ * Handle rare XML entity characters like LF: 
(`\n`),
111+ * CR: 
(`\r`) and CRLF: (`\r\n`).
112+ *
113+ * These are technically valid in TTML (XML allows them)
114+ * but unusual in practice, since most TTML line breaks
115+ * are represented as <br/> tags instead.
116+ * As a defensive approach, we normalize them:
117+ *
118+ * - Windows (\r\n), macOS (\r), and Unix (\n) → unified SRT NEW_LINE (\r\n)
119+ *
120+ * Although well-formed TTML normally encodes line breaks
121+ * as <br/> tags, some auto-generated or malformed TTML files
122+ * may embed literal newline entities (
, 
). This
123+ * normalization ensures these cases render properly in SRT
124+ * players instead of breaking the subtitle structure.
125+ *
126+ * @param text To be normalized text with actual characters.
127+ * @return Unified SRT NEW_LINE converted from all kinds of line breaks.
128+ */
129+ private String normalizeLineBreakForSrt (final String text ) {
130+ String cleaned = text ;
131+
132+ // NOTE:
133+ // The order of newline replacements must NOT change,
134+ // or duplicated line breaks (e.g. \r\n → \n\n) will occur.
135+ cleaned = cleaned .replace ("\r \n " , "\n " )
136+ .replace ("\r " , "\n " );
137+
138+ cleaned = cleaned .replace ("\n " , NEW_LINE );
139+
140+ return cleaned ;
141+ }
142+
143+ private String normalizeForSrt (final String actualText ) {
144+ String cleaned = actualText ;
145+
146+ // Replace NBSP "non-breaking space" (\u00A0) with regular space ' '(\u0020).
147+ //
148+ // Why:
149+ // - Some viewers render NBSP(\u00A0) incorrectly:
150+ // * MPlayer 1.5: shown as “??”
151+ // * Linux command `cat -A`: displayed as control-like markers
152+ // (M-BM-)
153+ // * Acode (Android editor): displayed as visible replacement
154+ // glyphs (red dots)
155+ // - Other viewers show it as a normal space (e.g., VS Code 1.104.0,
156+ // vlc 3.0.20, mpv 0.37.0, Totem 43.0)
157+ // → Mixed rendering creates inconsistency and may confuse users.
158+ //
159+ // Details:
160+ // - YouTube TTML subtitles use both regular spaces (\u0020)
161+ // and non-breaking spaces (\u00A0).
162+ // - SRT subtitles only support regular spaces (\u0020),
163+ // so \u00A0 may cause display issues.
164+ // - \u00A0 and \u0020 are visually identical (i.e., they both
165+ // appear as spaces ' '), but they differ in Unicode encoding,
166+ // and NBSP (\u00A0) renders differently in different viewers.
167+ // - SRT is a plain-text format and does not interpret
168+ // "non-breaking" behavior.
169+ //
170+ // Conclusion:
171+ // - Ensure uniform behavior, so replace it to a regular space
172+ // without "non-breaking" behavior.
173+ //
174+ // References:
175+ // - Unicode U+00A0 NBSP (Latin-1 Supplement):
176+ // https://unicode.org/charts/PDF/U0080.pdf
177+ cleaned = cleaned .replace ('\u00A0' , ' ' ) // Non-breaking space
178+ .replace ('\u202F' , ' ' ) // Narrow no-break space
179+ .replace ('\u205F' , ' ' ) // Medium mathematical space
180+ .replace ('\u3000' , ' ' ) // Ideographic space
181+ // \u2000 ~ \u200A are whitespace characters (e.g.,
182+ // en space, em space), replaced with regular space (\u0020).
183+ .replaceAll ("[\\ u2000-\\ u200A]" , " " ); // Whitespace characters
184+
185+ // \u200B ~ \u200F are a range of non-spacing characters
186+ // (e.g., zero-width space, zero-width non-joiner, etc.),
187+ // which have no effect in *.SRT files and may cause
188+ // display issues.
189+ // These characters are invisible to the human eye, and
190+ // they still exist in the encoding, so they need to be
191+ // removed.
192+ // After removal, the actual content becomes completely
193+ // empty "", meaning there are no characters left, just
194+ // an empty space, which helps avoid formatting issues
195+ // in subtitles.
196+ cleaned = cleaned .replaceAll ("[\\ u200B-\\ u200F]" , "" ); // Non-spacing characters
197+
198+ // Remove control characters (\u0000 ~ \u001F, except
199+ // \n, \r, \t).
200+ // - These are ASCII C0 control codes (e.g. \u0001 SOH,
201+ // \u0008 BS, \u001F US), invisible and irrelevant in
202+ // subtitles, may cause square boxes (?) in players.
203+ // - Reference:
204+ // Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf)
205+ // ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters)
206+ cleaned = cleaned .replaceAll ("[\\ u0000-\\ u0008\\ u000B\\ u000C\\ u000E-\\ u001F]" , "" );
207+
208+ // Reasoning:
209+ // - subtitle files generally don't require tabs for alignment.
210+ // - Tabs can be displayed with varying widths across different
211+ // editors or platforms, which may cause display issues.
212+ // - Replace it with a single space for consistent display
213+ // across different editors or platforms.
214+ cleaned = cleaned .replace ('\t' , ' ' );
215+
216+ cleaned = normalizeLineBreakForSrt (cleaned );
217+
218+ return cleaned ;
219+ }
220+
221+ private String sanitizeFragment (final String raw ) {
222+ if (null == raw ) {
223+ return "" ;
224+ }
225+
226+ final String actualCharacters = decodeXmlEntities (raw );
227+
228+ final String srtSafeText = normalizeForSrt (actualCharacters );
229+
230+ return srtSafeText ;
231+ }
232+
233+ // Recursively process all child nodes to ensure text inside
234+ // nested tags (e.g., <span>) is also extracted.
235+ private void traverseChildNodesForNestedTags (final Node parent ,
236+ final StringBuilder text ) {
237+ for (final Node child : parent .childNodes ()) {
238+ extractText (child , text );
239+ }
240+ }
241+
242+ // CHECKSTYLE:OFF checkstyle:JavadocStyle
243+ // checkstyle does not understand that span tags are inside a code block
244+ /**
245+ * <p>Recursive method to extract text from all nodes.</p>
246+ * <p>
247+ * This method processes {@link TextNode}s and {@code <br>} tags,
248+ * recursively extracting text from nested tags
249+ * (e.g. extracting text from nested {@code <span>} tags).
250+ * Newlines are added for {@code <br>} tags.
251+ * </p>
252+ * @param node the current node to process
253+ * @param text the {@link StringBuilder} to append the extracted text to
254+ */
255+ // --------------------------------------------------------------------
256+ // [INTERNAL NOTE] TTML text layer explanation
257+ //
258+ // TTML parsing involves multiple text "layers":
259+ // 1. Raw XML entities (e.g., <,  ) are decoded by Jsoup.
260+ // 2. extractText() works on DOM TextNodes (already parsed strings).
261+ // 3. sanitizeFragment() decodes remaining entities and fixes
262+ // Unicode quirks.
263+ // 4. normalizeForSrt() ensures literal text is safe for SRT output.
264+ //
265+ // In short:
266+ // Jsoup handles XML-level syntax,
267+ // our code handles text-level normalization for subtitles.
268+ // --------------------------------------------------------------------
269+ private void extractText (final Node node , final StringBuilder text ) {
270+ if (node instanceof TextNode textNode ) {
271+ String rawTtmlFragment = textNode .getWholeText ();
272+ String srtContent = sanitizeFragment (rawTtmlFragment );
273+ text .append (srtContent );
274+ } else if (node instanceof Element element ) {
275+ // <br> is a self-closing HTML tag used to insert a line break.
276+ if (element .tagName ().equalsIgnoreCase ("br" )) {
277+ // Add a newline for <br> tags
278+ text .append (NEW_LINE );
279+ }
280+ }
281+
282+ traverseChildNodesForNestedTags (node , text );
283+ }
284+ // CHECKSTYLE:ON
285+
62286 public void build (final SharpStream ttml ) throws IOException {
63287 /*
64288 * TTML parser with BASIC support
@@ -79,21 +303,15 @@ public void build(final SharpStream ttml) throws IOException {
79303 final Elements paragraphList = doc .select ("body > div > p" );
80304
81305 // check if has frames
82- if (paragraphList .size () < 1 ) {
306+ if (paragraphList .isEmpty () ) {
83307 return ;
84308 }
85309
86310 for (final Element paragraph : paragraphList ) {
87311 text .setLength (0 );
88312
89- for (final Node children : paragraph .childNodes ()) {
90- if (children instanceof TextNode ) {
91- text .append (((TextNode ) children ).text ());
92- } else if (children instanceof Element
93- && ((Element ) children ).tagName ().equalsIgnoreCase ("br" )) {
94- text .append (NEW_LINE );
95- }
96- }
313+ // Recursively extract text from all child nodes
314+ extractText (paragraph , text );
97315
98316 if (ignoreEmptyFrames && text .length () < 1 ) {
99317 continue ;
0 commit comments