diff --git a/README.md b/README.md index 2e11b7f3..084f738d 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,18 @@ Default: `23` Default: `true` +### SRT Reader configuration (`"srt_reader"`) + +#### extended_tags + +`"extended_tags" : true | false` + +If `true`, the following extended formatting tags are supported: `{bold}`, +`<bold>`, `{b}`, `{italic}`, `<italic>`, `{i}`, `{underline}`, `<underline>` and +`{u}`. + +Default: `false` + ### VTT Writer configuration (`"vtt_writer"`) #### line_position diff --git a/src/main/python/ttconv/srt/config.py b/src/main/python/ttconv/srt/config.py index 2cfcf70d..52badca3 100644 --- a/src/main/python/ttconv/srt/config.py +++ b/src/main/python/ttconv/srt/config.py @@ -30,6 +30,17 @@ from dataclasses import dataclass, field from ttconv.config import ModuleConfiguration +@dataclass +class SRTReaderConfiguration(ModuleConfiguration): + """SRT reader configuration""" + + @classmethod + def name(cls): + return "srt_reader" + + # enables support for extended tags: {b}, {bold}, <bold> (and italic/underline equivalents) + extended_tags: bool = field(default=False, metadata={"decoder": bool}) + @dataclass class SRTWriterConfiguration(ModuleConfiguration): """SRT writer configuration""" diff --git a/src/main/python/ttconv/srt/reader.py b/src/main/python/ttconv/srt/reader.py index 84f906af..7397625f 100644 --- a/src/main/python/ttconv/srt/reader.py +++ b/src/main/python/ttconv/srt/reader.py @@ -36,6 +36,7 @@ from ttconv import model from ttconv import style_properties as styles from ttconv.utils import parse_color +from ttconv.srt.config import SRTReaderConfiguration LOGGER = logging.getLogger(__name__) @@ -61,11 +62,11 @@ def handle_starttag(self, tag, attrs): self.parent.push_child(span) self.parent = span - if tag.lower() in ("b", "bold"): + if tag.lower() in ("b"): span.set_style(styles.StyleProperties.FontWeight, styles.FontWeightType.bold) - elif tag.lower() in ("i", "italic"): + elif tag.lower() in ("i"): span.set_style(styles.StyleProperties.FontStyle, 
styles.FontStyleType.italic) - elif tag.lower() in ("u", "underline"): + elif tag.lower() in ("u"): span.set_style(styles.StyleProperties.TextDecoration, styles.TextDecorationType(underline=True)) elif tag.lower() == "font": for attr in attrs: @@ -116,9 +117,11 @@ class _State(Enum): _DEFAULT_OUTLINE_COLOR = styles.NamedColors.black.value _DEFAULT_LINE_HEIGHT = styles.LengthType(125, styles.LengthType.Units.pct) -def to_model(data_file: typing.IO, _config = None, progress_callback=lambda _: None): +def to_model(data_file: typing.IO, _config: SRTReaderConfiguration = None, progress_callback=lambda _: None): """Converts an SRT document to the data model""" + extended_tags = _config.extended_tags if isinstance(_config, SRTReaderConfiguration) else False + doc = model.ContentDocument() region = model.Region(_DEFAULT_REGION_ID, doc) @@ -236,14 +239,28 @@ def to_model(data_file: typing.IO, _config = None, progress_callback=lambda _: N if state in (_State.TEXT, _State.TEXT_MORE): if line is None or _EMPTY_RE.fullmatch(line): - subtitle_text = subtitle_text.strip('\r\n')\ - .replace(r"\n\r", "\n")\ - .replace(r"{bold}", r"")\ - .replace(r"{/bold}", r"")\ - .replace(r"{italic}", r"")\ - .replace(r"{/italic}", r"")\ - .replace(r"{underline}", r"")\ - .replace(r"{/underline}", r"") + subtitle_text = subtitle_text.strip('\r\n').replace(r"\n\r", "\n") + + if extended_tags: + subtitle_text = subtitle_text\ + .replace(r"{b}", r"")\ + .replace(r"{/b}", r"")\ + .replace(r"{bold}", r"")\ + .replace(r"{/bold}", r"")\ + .replace(r"", r"")\ + .replace(r"", r"")\ + .replace(r"{i}", r"")\ + .replace(r"{/i}", r"")\ + .replace(r"{italic}", r"")\ + .replace(r"{/italic}", r"")\ + .replace(r"", r"")\ + .replace(r"", r"")\ + .replace(r"{u}", r"")\ + .replace(r"{/u}", r"")\ + .replace(r"{underline}", r"")\ + .replace(r"{/underline}", r"")\ + .replace(r"", r"")\ + .replace(r"", r"") parser = _TextParser(current_p, line_index) parser.feed(subtitle_text) diff --git a/src/main/python/ttconv/tt.py 
b/src/main/python/ttconv/tt.py index 0f0350d7..074f2b2c 100755 --- a/src/main/python/ttconv/tt.py +++ b/src/main/python/ttconv/tt.py @@ -52,7 +52,7 @@ from ttconv.isd import ISDConfiguration from ttconv.scc.config import SccReaderConfiguration, SccWriterConfiguration from ttconv.stl.config import STLReaderConfiguration -from ttconv.srt.config import SRTWriterConfiguration +from ttconv.srt.config import SRTReaderConfiguration, SRTWriterConfiguration LOGGER = logging.getLogger("ttconv") @@ -60,7 +60,8 @@ GeneralConfiguration, IMSCWriterConfiguration, ISDConfiguration, - SccReaderConfiguration + SccReaderConfiguration, + SRTReaderConfiguration, ] @@ -335,12 +336,16 @@ def convert(args): model = stl_reader.to_model(f, reader_config, progress_callback_read) elif reader_type is FileTypes.SRT: + # + # Read the config + # + reader_config = read_config_from_json(SRTReaderConfiguration, json_config_data) # # Open the file and pass it to the reader # with open(inputfile, "r", encoding="utf-8") as f: - model = srt_reader.to_model(f, None, progress_callback_read) + model = srt_reader.to_model(f, reader_config, progress_callback_read) elif reader_type is FileTypes.VTT: diff --git a/src/test/python/test_srt_reader.py b/src/test/python/test_srt_reader.py index 69bb410b..7acd1226 100644 --- a/src/test/python/test_srt_reader.py +++ b/src/test/python/test_srt_reader.py @@ -29,6 +29,7 @@ import unittest import io +from ttconv.srt.config import SRTReaderConfiguration from ttconv.srt.reader import to_model import ttconv.style_properties as styles import ttconv.model as model @@ -67,7 +68,7 @@ def test_sample(self): def test_bold(self): f = io.StringIO(r"""1 00:02:16,612 --> 00:02:19,376 -Hello my name is Bob +Hello my name is Bob """) doc = to_model(f) for e in doc.get_body().dfs_iterator(): @@ -107,17 +108,53 @@ def test_bold_alt(self): 00:02:16,612 --> 00:02:19,376 Hello {bold}my{/bold} name is Bob """) + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) + for e in 
doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold: + break + else: + self.fail() + doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold: + self.fail() + + def test_bold_alt2(self): + f = io.StringIO(r"""1 +00:02:16,612 --> 00:02:19,376 +Hello my name is Bob +""") + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold: + break + else: + self.fail() doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold: + self.fail() + + def test_bold_alt3(self): + f = io.StringIO(r"""1 +00:02:16,612 --> 00:02:19,376 +Hello {b}my{/b} name is Bob +""") + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) for e in doc.get_body().dfs_iterator(): if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold: break else: self.fail() + doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontWeight) == styles.FontWeightType.bold: + self.fail() def test_italic(self): f = io.StringIO(r"""1 00:02:16,612 --> 00:02:19,376 -Hello my name is Bob +Hello my name is Bob """) doc = to_model(f) for e in doc.get_body().dfs_iterator(): @@ -131,17 +168,53 @@ def test_italic_alt(self): 00:02:16,612 --> 00:02:19,376 Hello {italic}my{/italic} name is Bob """) + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic: + break + else: + self.fail() + doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic: + self.fail() + + def test_italic_alt1(self): + f = 
io.StringIO(r"""1 +00:02:16,612 --> 00:02:19,376 +Hello {i}my{/i} name is Bob +""") + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic: + break + else: + self.fail() doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic: + self.fail() + + def test_italic_alt2(self): + f = io.StringIO(r"""1 +00:02:16,612 --> 00:02:19,376 +Hello my name is Bob +""") + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) for e in doc.get_body().dfs_iterator(): if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic: break else: self.fail() + doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + if e.get_style(styles.StyleProperties.FontStyle) == styles.FontStyleType.italic: + self.fail() def test_underline(self): f = io.StringIO(r"""1 00:02:16,612 --> 00:02:19,376 -Hello my name is Bob +Hello my name is Bob """) doc = to_model(f) for e in doc.get_body().dfs_iterator(): @@ -156,13 +229,54 @@ def test_underline_alt(self): 00:02:16,612 --> 00:02:19,376 Hello {underline}my{/underline} name is Bob """) + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) + for e in doc.get_body().dfs_iterator(): + text_decoration = e.get_style(styles.StyleProperties.TextDecoration) + if text_decoration is not None and text_decoration.underline: + break + else: + self.fail() + doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + text_decoration = e.get_style(styles.StyleProperties.TextDecoration) + if text_decoration is not None and text_decoration.underline: + self.fail() + + def test_underline_alt1(self): + f = io.StringIO(r"""1 +00:02:16,612 --> 00:02:19,376 +Hello {u}my{/u} name is Bob +""") + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) + for e in doc.get_body().dfs_iterator(): + text_decoration = 
e.get_style(styles.StyleProperties.TextDecoration) + if text_decoration is not None and text_decoration.underline: + break + else: + self.fail() doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + text_decoration = e.get_style(styles.StyleProperties.TextDecoration) + if text_decoration is not None and text_decoration.underline: + self.fail() + + def test_underline_alt2(self): + f = io.StringIO(r"""1 +00:02:16,612 --> 00:02:19,376 +Hello my name is Bob +""") + doc = to_model(f, SRTReaderConfiguration(extended_tags=True)) for e in doc.get_body().dfs_iterator(): text_decoration = e.get_style(styles.StyleProperties.TextDecoration) if text_decoration is not None and text_decoration.underline: break else: self.fail() + doc = to_model(f) + for e in doc.get_body().dfs_iterator(): + text_decoration = e.get_style(styles.StyleProperties.TextDecoration) + if text_decoration is not None and text_decoration.underline: + self.fail() def test_blue(self): f = io.StringIO(r"""1 @@ -180,8 +294,8 @@ def test_blue(self): def test_multiline_tags(self): f = io.StringIO(r"""1 00:02:16,612 --> 00:02:19,376 -Hello my - name is Bob +Hello my + name is Bob """) doc = to_model(f) for e in doc.get_body().dfs_iterator(): diff --git a/src/test/python/test_tt.py b/src/test/python/test_tt.py index 6690f7ac..76de0edd 100644 --- a/src/test/python/test_tt.py +++ b/src/test/python/test_tt.py @@ -230,6 +230,26 @@ def test_lcd_filter(self): '--filter', 'lcd', '--config', '{"lcd": {"bg_color":"red"}}' ]) + + def test_srt_reader_extended_tags(self): + in_path = "src/test/resources/srt/extended-tags.srt" + + out_path = "build/extended_tags.ttml" + tt.main(['convert', + '-i', in_path, + '-o', out_path, + '--config', '{"srt_reader": {"extended_tags": true}}' + ]) + with open(out_path, encoding="utf-8") as f: + self.assertRegex(f.read(), "fontWeight") + + out_path = "build/no-extended_tags.ttml" + tt.main(['convert', + '-i', in_path, + '-o', out_path + ]) + with open(out_path, encoding="utf-8") as f: 
+ self.assertNotRegex(f.read(), "fontWeight") def test_imsc11filter(self): out_path = "build/imsc11filter.ttml" diff --git a/src/test/resources/srt/extended-tags.srt b/src/test/resources/srt/extended-tags.srt new file mode 100644 index 00000000..0b8b84d2 --- /dev/null +++ b/src/test/resources/srt/extended-tags.srt @@ -0,0 +1,3 @@ +1 +00:02:16,612 --> 00:02:19,376 +Hello my name is Bob