Skip to content

Commit e087230

Browse files
authored
srt_reader: hide extended formatting tags behind a configuration flag
#490
1 parent 8b4b35d commit e087230

File tree

7 files changed

+202
-20
lines changed

7 files changed

+202
-20
lines changed

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,18 @@ Default: `23`
201201

202202
Default: `true`
203203

204+
### SRT Reader configuration (`"srt_reader"`)
205+
206+
#### extended_tags
207+
208+
`"extended_tags" : true | false`
209+
210+
If `true`, the following extended formatting tags are supported: `{bold}`,
211+
`<bold>`, `{b}`, `{italic}`, `<italic>`, `{i}`, `{underline}`, `<underline>` and
212+
`{u}`.
213+
214+
Default: `false`
215+
204216
### VTT Writer configuration (`"vtt_writer"`)
205217

206218
#### line_position

src/main/python/ttconv/srt/config.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@
3030
from dataclasses import dataclass, field
3131
from ttconv.config import ModuleConfiguration
3232

33+
@dataclass
34+
class SRTReaderConfiguration(ModuleConfiguration):
35+
"""SRT reader configuration"""
36+
37+
@classmethod
38+
def name(cls):
39+
return "srt_reader"
40+
41+
# enables support for extended tags: {b}, {bold}, <bold> (and italic/underline equivalents)
42+
extended_tags: bool = field(default=False, metadata={"decoder": bool})
43+
3344
@dataclass
3445
class SRTWriterConfiguration(ModuleConfiguration):
3546
"""SRT writer configuration"""

src/main/python/ttconv/srt/reader.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from ttconv import model
3737
from ttconv import style_properties as styles
3838
from ttconv.utils import parse_color
39+
from ttconv.srt.config import SRTReaderConfiguration
3940

4041
LOGGER = logging.getLogger(__name__)
4142

@@ -61,11 +62,11 @@ def handle_starttag(self, tag, attrs):
6162
self.parent.push_child(span)
6263
self.parent = span
6364

64-
if tag.lower() in ("b", "bold"):
65+
if tag.lower() in ("b"):
6566
span.set_style(styles.StyleProperties.FontWeight, styles.FontWeightType.bold)
66-
elif tag.lower() in ("i", "italic"):
67+
elif tag.lower() in ("i"):
6768
span.set_style(styles.StyleProperties.FontStyle, styles.FontStyleType.italic)
68-
elif tag.lower() in ("u", "underline"):
69+
elif tag.lower() in ("u"):
6970
span.set_style(styles.StyleProperties.TextDecoration, styles.TextDecorationType(underline=True))
7071
elif tag.lower() == "font":
7172
for attr in attrs:
@@ -116,9 +117,11 @@ class _State(Enum):
116117
_DEFAULT_OUTLINE_COLOR = styles.NamedColors.black.value
117118
_DEFAULT_LINE_HEIGHT = styles.LengthType(125, styles.LengthType.Units.pct)
118119

119-
def to_model(data_file: typing.IO, _config = None, progress_callback=lambda _: None):
120+
def to_model(data_file: typing.IO, _config: SRTReaderConfiguration = None, progress_callback=lambda _: None):
120121
"""Converts an SRT document to the data model"""
121122

123+
extended_tags = _config.extended_tags if isinstance(_config, SRTReaderConfiguration) else False
124+
122125
doc = model.ContentDocument()
123126

124127
region = model.Region(_DEFAULT_REGION_ID, doc)
@@ -236,14 +239,28 @@ def to_model(data_file: typing.IO, _config = None, progress_callback=lambda _: N
236239
if state in (_State.TEXT, _State.TEXT_MORE):
237240

238241
if line is None or _EMPTY_RE.fullmatch(line):
239-
subtitle_text = subtitle_text.strip('\r\n')\
240-
.replace(r"\n\r", "\n")\
241-
.replace(r"{bold}", r"<bold>")\
242-
.replace(r"{/bold}", r"</bold>")\
243-
.replace(r"{italic}", r"<italic>")\
244-
.replace(r"{/italic}", r"</italic>")\
245-
.replace(r"{underline}", r"<underline>")\
246-
.replace(r"{/underline}", r"</underline>")
242+
subtitle_text = subtitle_text.strip('\r\n').replace(r"\n\r", "\n")
243+
244+
if extended_tags:
245+
subtitle_text = subtitle_text\
246+
.replace(r"{b}", r"<b>")\
247+
.replace(r"{/b}", r"</b>")\
248+
.replace(r"{bold}", r"<b>")\
249+
.replace(r"{/bold}", r"</b>")\
250+
.replace(r"<bold>", r"<b>")\
251+
.replace(r"</bold>", r"</b>")\
252+
.replace(r"{i}", r"<i>")\
253+
.replace(r"{/i}", r"</i>")\
254+
.replace(r"{italic}", r"<i>")\
255+
.replace(r"{/italic}", r"</i>")\
256+
.replace(r"<italic>", r"<i>")\
257+
.replace(r"</italic>", r"</i>")\
258+
.replace(r"{u}", r"<u>")\
259+
.replace(r"{/u}", r"</u>")\
260+
.replace(r"{underline}", r"<u>")\
261+
.replace(r"{/underline}", r"</u>")\
262+
.replace(r"<underline>", r"<u>")\
263+
.replace(r"</underline>", r"</u>")
247264

248265
parser = _TextParser(current_p, line_index)
249266
parser.feed(subtitle_text)

src/main/python/ttconv/tt.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,16 @@
5252
from ttconv.isd import ISDConfiguration
5353
from ttconv.scc.config import SccReaderConfiguration, SccWriterConfiguration
5454
from ttconv.stl.config import STLReaderConfiguration
55-
from ttconv.srt.config import SRTWriterConfiguration
55+
from ttconv.srt.config import SRTReaderConfiguration, SRTWriterConfiguration
5656

5757
LOGGER = logging.getLogger("ttconv")
5858

5959
CONFIGURATIONS = [
6060
GeneralConfiguration,
6161
IMSCWriterConfiguration,
6262
ISDConfiguration,
63-
SccReaderConfiguration
63+
SccReaderConfiguration,
64+
SRTReaderConfiguration,
6465
]
6566

6667

@@ -335,12 +336,16 @@ def convert(args):
335336
model = stl_reader.to_model(f, reader_config, progress_callback_read)
336337

337338
elif reader_type is FileTypes.SRT:
339+
#
340+
# Read the config
341+
#
342+
reader_config = read_config_from_json(SRTReaderConfiguration, json_config_data)
338343

339344
#
340345
# Open the file and pass it to the reader
341346
#
342347
with open(inputfile, "r", encoding="utf-8") as f:
343-
model = srt_reader.to_model(f, None, progress_callback_read)
348+
model = srt_reader.to_model(f, reader_config, progress_callback_read)
344349

345350
elif reader_type is FileTypes.VTT:
346351

src/test/python/test_srt_reader.py

Lines changed: 119 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import unittest
3030
import io
3131

32+
from ttconv.srt.config import SRTReaderConfiguration
3233
from ttconv.srt.reader import to_model
3334
import ttconv.style_properties as styles
3435
import ttconv.model as model
@@ -67,7 +68,7 @@ def test_sample(self):
6768
def test_bold(self):
6869
f = io.StringIO(r"""1
6970
00:02:16,612 --> 00:02:19,376
70-
Hello <bold>my</bold> name is Bob
71+
Hello <b>my</b> name is Bob
7172
""")
7273
doc = to_model(f)
7374
for e in doc.get_body().dfs_iterator():
@@ -107,17 +108,53 @@ def test_bold_alt(self):
107108
00:02:16,612 --> 00:02:19,376
108109
Hello {bold}my{/bold} name is Bob
109110
""")
111+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
112+
for e in doc.get_body().dfs_iterator():
113+
if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold:
114+
break
115+
else:
116+
self.fail()
117+
doc = to_model(f)
118+
for e in doc.get_body().dfs_iterator():
119+
if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold:
120+
self.fail()
121+
122+
def test_bold_alt2(self):
123+
f = io.StringIO(r"""1
124+
00:02:16,612 --> 00:02:19,376
125+
Hello <bold>my</bold> name is Bob
126+
""")
127+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
128+
for e in doc.get_body().dfs_iterator():
129+
if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold:
130+
break
131+
else:
132+
self.fail()
110133
doc = to_model(f)
134+
for e in doc.get_body().dfs_iterator():
135+
if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold:
136+
self.fail()
137+
138+
def test_bold_alt3(self):
139+
f = io.StringIO(r"""1
140+
00:02:16,612 --> 00:02:19,376
141+
Hello {b}my{/b} name is Bob
142+
""")
143+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
111144
for e in doc.get_body().dfs_iterator():
112145
if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold:
113146
break
114147
else:
115148
self.fail()
149+
doc = to_model(f)
150+
for e in doc.get_body().dfs_iterator():
151+
if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold:
152+
self.fail()
116153

117154
def test_italic(self):
118155
f = io.StringIO(r"""1
119156
00:02:16,612 --> 00:02:19,376
120-
Hello <italic>my</italic> name is Bob
157+
Hello <i>my</i> name is Bob
121158
""")
122159
doc = to_model(f)
123160
for e in doc.get_body().dfs_iterator():
@@ -131,17 +168,53 @@ def test_italic_alt(self):
131168
00:02:16,612 --> 00:02:19,376
132169
Hello {italic}my{/italic} name is Bob
133170
""")
171+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
172+
for e in doc.get_body().dfs_iterator():
173+
if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic:
174+
break
175+
else:
176+
self.fail()
177+
doc = to_model(f)
178+
for e in doc.get_body().dfs_iterator():
179+
if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic:
180+
self.fail()
181+
182+
def test_italic_alt1(self):
183+
f = io.StringIO(r"""1
184+
00:02:16,612 --> 00:02:19,376
185+
Hello {i}my{/i} name is Bob
186+
""")
187+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
188+
for e in doc.get_body().dfs_iterator():
189+
if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic:
190+
break
191+
else:
192+
self.fail()
134193
doc = to_model(f)
194+
for e in doc.get_body().dfs_iterator():
195+
if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic:
196+
self.fail()
197+
198+
def test_italic_alt2(self):
199+
f = io.StringIO(r"""1
200+
00:02:16,612 --> 00:02:19,376
201+
Hello <italic>my</italic> name is Bob
202+
""")
203+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
135204
for e in doc.get_body().dfs_iterator():
136205
if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic:
137206
break
138207
else:
139208
self.fail()
209+
doc = to_model(f)
210+
for e in doc.get_body().dfs_iterator():
211+
if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic:
212+
self.fail()
140213

141214
def test_underline(self):
142215
f = io.StringIO(r"""1
143216
00:02:16,612 --> 00:02:19,376
144-
Hello <underline>my</underline> name is Bob
217+
Hello <u>my</u> name is Bob
145218
""")
146219
doc = to_model(f)
147220
for e in doc.get_body().dfs_iterator():
@@ -156,13 +229,54 @@ def test_underline_alt(self):
156229
00:02:16,612 --> 00:02:19,376
157230
Hello {underline}my{/underline} name is Bob
158231
""")
232+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
233+
for e in doc.get_body().dfs_iterator():
234+
text_decoration = e.get_style(styles.StyleProperties.TextDecoration)
235+
if text_decoration is not None and text_decoration.underline:
236+
break
237+
else:
238+
self.fail()
239+
doc = to_model(f)
240+
for e in doc.get_body().dfs_iterator():
241+
text_decoration = e.get_style(styles.StyleProperties.TextDecoration)
242+
if text_decoration is not None and text_decoration.underline:
243+
self.fail()
244+
245+
def test_underline_alt1(self):
246+
f = io.StringIO(r"""1
247+
00:02:16,612 --> 00:02:19,376
248+
Hello {u}my{/u} name is Bob
249+
""")
250+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
251+
for e in doc.get_body().dfs_iterator():
252+
text_decoration = e.get_style(styles.StyleProperties.TextDecoration)
253+
if text_decoration is not None and text_decoration.underline:
254+
break
255+
else:
256+
self.fail()
159257
doc = to_model(f)
258+
for e in doc.get_body().dfs_iterator():
259+
text_decoration = e.get_style(styles.StyleProperties.TextDecoration)
260+
if text_decoration is not None and text_decoration.underline:
261+
self.fail()
262+
263+
def test_underline_alt2(self):
264+
f = io.StringIO(r"""1
265+
00:02:16,612 --> 00:02:19,376
266+
Hello <underline>my</underline> name is Bob
267+
""")
268+
doc = to_model(f, SRTReaderConfiguration(extended_tags=True))
160269
for e in doc.get_body().dfs_iterator():
161270
text_decoration = e.get_style(styles.StyleProperties.TextDecoration)
162271
if text_decoration is not None and text_decoration.underline:
163272
break
164273
else:
165274
self.fail()
275+
doc = to_model(f)
276+
for e in doc.get_body().dfs_iterator():
277+
text_decoration = e.get_style(styles.StyleProperties.TextDecoration)
278+
if text_decoration is not None and text_decoration.underline:
279+
self.fail()
166280

167281
def test_blue(self):
168282
f = io.StringIO(r"""1
@@ -180,8 +294,8 @@ def test_blue(self):
180294
def test_multiline_tags(self):
181295
f = io.StringIO(r"""1
182296
00:02:16,612 --> 00:02:19,376
183-
Hello <bold>my
184-
</bold> name is Bob
297+
Hello <b>my
298+
</b> name is Bob
185299
""")
186300
doc = to_model(f)
187301
for e in doc.get_body().dfs_iterator():

src/test/python/test_tt.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,26 @@ def test_lcd_filter(self):
230230
'--filter', 'lcd',
231231
'--config', '{"lcd": {"bg_color":"red"}}'
232232
])
233+
234+
def test_srt_reader_extended_tags(self):
235+
in_path = "src/test/resources/srt/extended-tags.srt"
236+
237+
out_path = "build/extended_tags.ttml"
238+
tt.main(['convert',
239+
'-i', in_path,
240+
'-o', out_path,
241+
'--config', '{"srt_reader": {"extended_tags": true}}'
242+
])
243+
with open(out_path, encoding="utf-8") as f:
244+
self.assertRegex(f.read(), "fontWeight")
245+
246+
out_path = "build/no-extended_tags.ttml"
247+
tt.main(['convert',
248+
'-i', in_path,
249+
'-o', out_path
250+
])
251+
with open(out_path, encoding="utf-8") as f:
252+
self.assertNotRegex(f.read(), "fontWeight")
233253

234254
def test_imsc11filter(self):
235255
out_path = "build/imsc11filter.ttml"
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
1
2+
00:02:16,612 --> 00:02:19,376
3+
Hello <bold>my</bold> name is Bob

0 commit comments

Comments
 (0)