Skip to content

Commit 0a89276

Browse files
authored
Merge pull request #12575 from TransZAllen/dev
[Bug] Fix missing subtitle text in manually downloaded *.SRT files. (issue #10030)
2 parents 2a9c6f0 + 300afde commit 0a89276

File tree

2 files changed

+548
-10
lines changed

2 files changed

+548
-10
lines changed

app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java

Lines changed: 228 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,11 @@
1515
import java.nio.charset.StandardCharsets;
1616

1717
/**
18-
* @author kapodamy
18+
* Converts TTML subtitles to SRT format.
19+
*
20+
* References:
21+
* - TTML 2.0 (W3C): https://www.w3.org/TR/ttml2/
22+
* - SRT format: https://en.wikipedia.org/wiki/SubRip
1923
*/
2024
public class SrtFromTtmlWriter {
2125
private static final String NEW_LINE = "\r\n";
@@ -59,6 +63,226 @@ private void writeString(final String text) throws IOException {
5963
out.write(text.getBytes(charset));
6064
}
6165

66+
/**
67+
* Decode XML or HTML entities into their actual (literal) characters.
68+
*
69+
* TTML is XML-based, so text nodes may contain escaped entities
70+
* instead of direct characters. For example:
71+
*
72+
* "&amp;" → "&"
73+
* "&lt;" → "<"
74+
* "&gt;" → ">"
75+
* "&#x9;" → "\t" (TAB)
76+
* "&#xA;" (&#10;) → "\n" (LINE FEED)
77+
*
78+
* XML text cannot contain "<" or "&" directly (and ">" is usually escaped as well),
79+
* so they must be represented using their entity-encoded forms.
80+
*
81+
* Jsoup sometimes leaves nested or encoded entities unresolved
82+
* (e.g. inside <p> text nodes in TTML files), so this function
83+
* acts as a final “safety net” to ensure all entities are decoded
84+
* before further normalization.
85+
*
86+
* Character representation layers for reference:
87+
* - Literal characters: <, >, &
88+
* → appear in runtime/output text (e.g. final SRT output)
89+
* - Escaped entities: &lt;, &gt;, &amp;
90+
* → appear in XML/HTML/TTML source files
91+
* - Numeric entities: &#xA0;, &#x9;, &#xD;
92+
* → appear mainly in XML/TTML files (also valid in HTML)
93+
* for non-printable or special characters
94+
* - Unicode escapes: \u00A0 (Java/Unicode internal form)
95+
* → appear only in Java source code (NOT valid in XML)
96+
*
97+
* XML entities include both named (&amp;, &lt;) and numeric
98+
* (&#xA0;, &#160;) forms.
99+
*
100+
* @param encodedEntities The raw text fragment possibly containing
101+
* encoded XML entities.
102+
* @return A decoded string where all entities are replaced by their
103+
* actual (literal) characters.
104+
*/
105+
private String decodeXmlEntities(final String encodedEntities) {
106+
return Parser.unescapeEntities(encodedEntities, true);
107+
}
108+
109+
/**
110+
* Handle rare XML entity characters like LF: &#xA;(`\n`),
111+
* CR: &#xD;(`\r`) and CRLF: &#xD;&#xA;(`\r\n`).
112+
*
113+
* These are technically valid in TTML (XML allows them)
114+
* but unusual in practice, since most TTML line breaks
115+
* are represented as <br/> tags instead.
116+
* As a defensive approach, we normalize them:
117+
*
118+
* - Windows (\r\n), classic Mac OS (\r), and Unix (\n) → unified SRT NEW_LINE (\r\n)
119+
*
120+
* Although well-formed TTML normally encodes line breaks
121+
* as <br/> tags, some auto-generated or malformed TTML files
122+
* may embed literal newline entities (&#xA;, &#xD;). This
123+
* normalization ensures these cases render properly in SRT
124+
* players instead of breaking the subtitle structure.
125+
*
126+
* @param text Text (with literal characters) to be normalized.
127+
* @return The text with every kind of line break converted to the SRT NEW_LINE.
128+
*/
129+
private String normalizeLineBreakForSrt(final String text) {
130+
String cleaned = text;
131+
132+
// NOTE:
133+
// The order of newline replacements must NOT change,
134+
// or duplicated line breaks (e.g. \r\n → \n\n) will occur.
135+
cleaned = cleaned.replace("\r\n", "\n")
136+
.replace("\r", "\n");
137+
138+
cleaned = cleaned.replace("\n", NEW_LINE);
139+
140+
return cleaned;
141+
}
142+
143+
private String normalizeForSrt(final String actualText) {
144+
String cleaned = actualText;
145+
146+
// Replace NBSP "non-breaking space" (\u00A0) with regular space ' '(\u0020).
147+
//
148+
// Why:
149+
// - Some viewers render NBSP(\u00A0) incorrectly:
150+
// * MPlayer 1.5: shown as “??”
151+
// * Linux command `cat -A`: displayed as control-like markers
152+
// (M-BM-)
153+
// * Acode (Android editor): displayed as visible replacement
154+
// glyphs (red dots)
155+
// - Other viewers show it as a normal space (e.g., VS Code 1.104.0,
156+
// vlc 3.0.20, mpv 0.37.0, Totem 43.0)
157+
// → Mixed rendering creates inconsistency and may confuse users.
158+
//
159+
// Details:
160+
// - YouTube TTML subtitles use both regular spaces (\u0020)
161+
// and non-breaking spaces (\u00A0).
162+
// - SRT subtitles only support regular spaces (\u0020),
163+
// so \u00A0 may cause display issues.
164+
// - \u00A0 and \u0020 are visually identical (i.e., they both
165+
// appear as spaces ' '), but they differ in Unicode encoding,
166+
// and NBSP (\u00A0) renders differently in different viewers.
167+
// - SRT is a plain-text format and does not interpret
168+
// "non-breaking" behavior.
169+
//
170+
// Conclusion:
171+
// - Ensure uniform behavior, so replace it to a regular space
172+
// without "non-breaking" behavior.
173+
//
174+
// References:
175+
// - Unicode U+00A0 NBSP (Latin-1 Supplement):
176+
// https://unicode.org/charts/PDF/U0080.pdf
177+
cleaned = cleaned.replace('\u00A0', ' ') // Non-breaking space
178+
.replace('\u202F', ' ') // Narrow no-break space
179+
.replace('\u205F', ' ') // Medium mathematical space
180+
.replace('\u3000', ' ') // Ideographic space
181+
// \u2000 ~ \u200A are whitespace characters (e.g.,
182+
// en space, em space), replaced with regular space (\u0020).
183+
.replaceAll("[\\u2000-\\u200A]", " "); // Whitespace characters
184+
185+
// \u200B ~ \u200F are a range of non-spacing characters
186+
// (e.g., zero-width space, zero-width non-joiner, etc.),
187+
// which have no effect in *.SRT files and may cause
188+
// display issues.
189+
// These characters are invisible to the human eye, and
190+
// they still exist in the encoding, so they need to be
191+
// removed.
192+
// After removal, the actual content becomes completely
193+
// empty "", meaning there are no characters left, just
194+
// an empty space, which helps avoid formatting issues
195+
// in subtitles.
196+
cleaned = cleaned.replaceAll("[\\u200B-\\u200F]", ""); // Non-spacing characters
197+
198+
// Remove control characters (\u0000 ~ \u001F, except
199+
// \n, \r, \t).
200+
// - These are ASCII C0 control codes (e.g. \u0001 SOH,
201+
// \u0008 BS, \u001F US), invisible and irrelevant in
202+
// subtitles, may cause square boxes (?) in players.
203+
// - Reference:
204+
// Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf)
205+
// ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters)
206+
cleaned = cleaned.replaceAll("[\\u0000-\\u0008\\u000B\\u000C\\u000E-\\u001F]", "");
207+
208+
// Reasoning:
209+
// - subtitle files generally don't require tabs for alignment.
210+
// - Tabs can be displayed with varying widths across different
211+
// editors or platforms, which may cause display issues.
212+
// - Replace it with a single space for consistent display
213+
// across different editors or platforms.
214+
cleaned = cleaned.replace('\t', ' ');
215+
216+
cleaned = normalizeLineBreakForSrt(cleaned);
217+
218+
return cleaned;
219+
}
220+
221+
private String sanitizeFragment(final String raw) {
222+
if (null == raw) {
223+
return "";
224+
}
225+
226+
final String actualCharacters = decodeXmlEntities(raw);
227+
228+
final String srtSafeText = normalizeForSrt(actualCharacters);
229+
230+
return srtSafeText;
231+
}
232+
233+
// Recursively process all child nodes to ensure text inside
234+
// nested tags (e.g., <span>) is also extracted.
235+
private void traverseChildNodesForNestedTags(final Node parent,
236+
final StringBuilder text) {
237+
for (final Node child : parent.childNodes()) {
238+
extractText(child, text);
239+
}
240+
}
241+
242+
// CHECKSTYLE:OFF checkstyle:JavadocStyle
243+
// checkstyle does not understand that span tags are inside a code block
244+
/**
245+
* <p>Recursive method to extract text from all nodes.</p>
246+
* <p>
247+
* This method processes {@link TextNode}s and {@code <br>} tags,
248+
* recursively extracting text from nested tags
249+
* (e.g. extracting text from nested {@code <span>} tags).
250+
* Newlines are added for {@code <br>} tags.
251+
* </p>
252+
* @param node the current node to process
253+
* @param text the {@link StringBuilder} to append the extracted text to
254+
*/
255+
// --------------------------------------------------------------------
256+
// [INTERNAL NOTE] TTML text layer explanation
257+
//
258+
// TTML parsing involves multiple text "layers":
259+
// 1. Raw XML entities (e.g., &lt;, &#xA0;) are decoded by Jsoup.
260+
// 2. extractText() works on DOM TextNodes (already parsed strings).
261+
// 3. sanitizeFragment() decodes remaining entities and fixes
262+
// Unicode quirks.
263+
// 4. normalizeForSrt() ensures literal text is safe for SRT output.
264+
//
265+
// In short:
266+
// Jsoup handles XML-level syntax,
267+
// our code handles text-level normalization for subtitles.
268+
// --------------------------------------------------------------------
269+
private void extractText(final Node node, final StringBuilder text) {
270+
if (node instanceof TextNode textNode) {
271+
String rawTtmlFragment = textNode.getWholeText();
272+
String srtContent = sanitizeFragment(rawTtmlFragment);
273+
text.append(srtContent);
274+
} else if (node instanceof Element element) {
275+
// <br> is a self-closing HTML tag used to insert a line break.
276+
if (element.tagName().equalsIgnoreCase("br")) {
277+
// Add a newline for <br> tags
278+
text.append(NEW_LINE);
279+
}
280+
}
281+
282+
traverseChildNodesForNestedTags(node, text);
283+
}
284+
// CHECKSTYLE:ON
285+
62286
public void build(final SharpStream ttml) throws IOException {
63287
/*
64288
* TTML parser with BASIC support
@@ -79,21 +303,15 @@ public void build(final SharpStream ttml) throws IOException {
79303
final Elements paragraphList = doc.select("body > div > p");
80304

81305
// check if has frames
82-
if (paragraphList.size() < 1) {
306+
if (paragraphList.isEmpty()) {
83307
return;
84308
}
85309

86310
for (final Element paragraph : paragraphList) {
87311
text.setLength(0);
88312

89-
for (final Node children : paragraph.childNodes()) {
90-
if (children instanceof TextNode) {
91-
text.append(((TextNode) children).text());
92-
} else if (children instanceof Element
93-
&& ((Element) children).tagName().equalsIgnoreCase("br")) {
94-
text.append(NEW_LINE);
95-
}
96-
}
313+
// Recursively extract text from all child nodes
314+
extractText(paragraph, text);
97315

98316
if (ignoreEmptyFrames && text.length() < 1) {
99317
continue;

0 commit comments

Comments
 (0)