Merge pull request #339 from pbs/OCTO-11027-BS-should-be-skipped-if-PAC-commands-are-duplicated

OlteanuRares · web-flow · commit 6ae805c11536 · 2024-05-30T10:54:31.000+03:00
OCTO-11027: BS should be skipped if PAC commands are duplicated
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -1,9 +1,15 @@
 Changelog
 ---------
-2.2.10
-^^^^^
+2.2.11
+^^^^^^
 - A space should not be placed before a mid row code if it follows a PAC command or a Tab Offset
 - The backspace command should be treated like other commands and duplicates should be skipped if PAC commands are duplicated
+- Prevent the WebVTT writer from creating a new cue on a line break
+- When a style-setting PAC also breaks the line, add the break first, then the style tag
+
+2.2.10
+^^^^^^
+- Yanked.
 
 2.2.9
 ^^^^^
diff --git a/docs/conf.py b/docs/conf.py
@@ -53,9 +53,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '2.2.10'
+version = '2.2.11'
 # The full version, including alpha/beta/rc tags.
-release = '2.2.10'
+release = '2.2.11'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/pycaption/base.py b/pycaption/base.py
@@ -114,13 +114,16 @@ class CaptionNode:
     STYLE = 2
     BREAK = 3
 
-    def __init__(self, type_, layout_info=None, content=None, start=None):
+    def __init__(
+            self, type_, layout_info=None, content=None, start=None, position=None
+    ):
         """
         :type type_: int
         :type layout_info: Layout
         """
         self.type_ = type_
         self.content = content
+        self.position = position
 
         # Boolean. Marks the beginning/ end of a Style node.
         self.start = start
@@ -139,19 +142,24 @@ def __repr__(self):
             raise RuntimeError(f'Unknown node type: {t}')
 
     @staticmethod
-    def create_text(text, layout_info=None):
+    def create_text(text, layout_info=None, position=None):
         return CaptionNode(
-            CaptionNode.TEXT, layout_info=layout_info, content=text)
+            type_=CaptionNode.TEXT, layout_info=layout_info,
+            position=position, content=text
+        )
 
     @staticmethod
     def create_style(start, content, layout_info=None):
         return CaptionNode(
-            CaptionNode.STYLE, layout_info=layout_info, content=content,
+            type_=CaptionNode.STYLE, layout_info=layout_info, content=content,
             start=start)
 
     @staticmethod
-    def create_break(layout_info=None):
-        return CaptionNode(CaptionNode.BREAK, layout_info=layout_info)
+    def create_break(layout_info=None, content=None):
+        return CaptionNode(
+            type_=CaptionNode.BREAK, layout_info=layout_info,
+            content=content
+        )
 
 
 class Caption:
diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
@@ -94,7 +94,7 @@
     MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE,
     SPECIAL_OR_EXTENDED_CHAR_TO_CODE, PAC_BYTES_TO_POSITIONING_MAP,
     PAC_HIGH_BYTE_BY_ROW, PAC_LOW_BYTE_BY_ROW_RESTRICTED,
-    PAC_TAB_OFFSET_COMMANDS,
+    PAC_TAB_OFFSET_COMMANDS, CUE_STARTING_COMMAND
 )
 from .specialized_collections import (  # noqa: F401
     TimingCorrectingCaptionList, NotifyingDict, CaptionCreator,
@@ -164,6 +164,7 @@ def __init__(self, *args, **kw):
         )
 
         self.last_command = ''
+        self.double_starter = False
 
         self.buffer_dict = NotifyingDict()
 
@@ -223,6 +224,7 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
         # split lines
         lines = content.splitlines()
 
+
         # loop through each line except the first
         for line in lines[1:]:
             self._translate_line(line)
@@ -307,24 +309,21 @@ def _translate_line(self, line):
         parts = r.findall(line.lower())
 
         self.time_translator.start_at(parts[0][0])
-
         word_list = parts[0][2].split(' ')
-        pacs_are_doubled = len(word_list) > 1 and word_list[0] == word_list[1]
+
         for idx, word in enumerate(word_list):
-            # ignore empty results or invalid commands
             word = word.strip()
-            previous_is_pac_or_tab = idx > 0 and (
-                _is_pac_command(word_list[idx-1]) or word_list[idx-1] in PAC_TAB_OFFSET_COMMANDS
+            previous_is_pac_or_tab = len(word_list) > 1 and (
+                    _is_pac_command(word_list[idx - 1]) or word_list[idx - 1] in PAC_TAB_OFFSET_COMMANDS
             )
             if len(word) == 4:
                 self._translate_word(
                     word=word,
                     previous_is_pac_or_tab=previous_is_pac_or_tab,
-                    pacs_are_doubled=pacs_are_doubled
                 )
 
-    def _translate_word(self, word, previous_is_pac_or_tab, pacs_are_doubled):
-        if self._handle_double_command(word, pacs_are_doubled):
+    def _translate_word(self, word, previous_is_pac_or_tab):
+        if self._handle_double_command(word):
             # count frames for timing
             self.time_translator.increment_frames()
             return
@@ -348,19 +347,25 @@ def _translate_word(self, word, previous_is_pac_or_tab, pacs_are_doubled):
         # count frames for timing only after processing a command
         self.time_translator.increment_frames()
 
-    def _handle_double_command(self, word, pacs_are_doubled):
+    def _handle_double_command(self, word):
         # If the caption is to be broadcast, each of the commands are doubled
         # up for redundancy in case the signal is garbled in transmission.
         # The decoder is programmed to ignore a second command when it is the
         # same as the first.
         # If we have doubled commands we're skipping also
         # doubled special characters and doubled extended characters
         # with only one member of each pair being displayed.
-        doubled_types = word in COMMANDS or _is_pac_command(word)
-        if pacs_are_doubled:
-            doubled_types = doubled_types or word in SPECIAL_CHARS or word in EXTENDED_CHARS
+
+        doubled_types = word != "94a1" and word in COMMANDS or _is_pac_command(word)
+        if self.double_starter:
+            doubled_types = doubled_types or word in EXTENDED_CHARS or word == "94a1" or word in SPECIAL_CHARS
+
+        if word in CUE_STARTING_COMMAND and word != self.last_command:
+            self.double_starter = False
 
         if doubled_types and word == self.last_command:
+            if word in CUE_STARTING_COMMAND:
+                self.double_starter = True
             self.last_command = ''
             return True
             # Fix for the <position> <tab offset> <position> <tab offset>
diff --git a/pycaption/scc/constants.py b/pycaption/scc/constants.py
@@ -1058,3 +1058,5 @@ def _restructure_bytes_to_position_map(byte_to_pos_map):
     "10a7", "10a8", "1029", "102a", "10ab", "102c", "10ad",
     "10ae", "102f", "97ad"
 ]
+
+CUE_STARTING_COMMAND = ['9425', '9426', '94a7', '9429', '9420']
diff --git a/pycaption/scc/specialized_collections.py b/pycaption/scc/specialized_collections.py
@@ -1,5 +1,4 @@
 import collections
-import unicodedata
 
 from ..base import CaptionList, Caption, CaptionNode
 from ..geometry import (
@@ -9,7 +8,7 @@
 from .constants import (
     PAC_BYTES_TO_POSITIONING_MAP, COMMANDS, PAC_TAB_OFFSET_COMMANDS,
     MICROSECONDS_PER_CODEWORD, BACKGROUND_COLOR_CODES,
-    MID_ROW_CODES, EXTENDED_CHARS, SPECIAL_CHARS
+    MID_ROW_CODES, EXTENDED_CHARS
 )
 
 PopOnCue = collections.namedtuple("PopOnCue", "buffer, start, end")
@@ -255,7 +254,10 @@ def create_and_store(self, node_buffer, start, end=0):
                 layout_info = _get_layout_from_tuple(instruction.position)
                 caption.nodes.append(
                     CaptionNode.create_text(
-                        instruction.text, layout_info=layout_info),
+                        text=instruction.text,
+                        layout_info=layout_info,
+                        position=instruction.position
+                    )
                 )
                 caption.layout_info = layout_info
 
@@ -366,6 +368,10 @@ def interpret_command(self, command, previous_is_pac_or_tab=False):
                 self._collection[-1].text = self._collection[-1].text[:-1]
 
         if 'italic' in text:
+            if self._position_tracer.is_linebreak_required():
+                self._collection.append(_InstructionNode.create_break(
+                    position=self._position_tracer.get_current_position()))
+                self._position_tracer.acknowledge_linebreak_consumed()
             if 'end' not in text:
                 self._collection.append(
                     _InstructionNode.create_italics_style(
diff --git a/pycaption/webvtt.py b/pycaption/webvtt.py
@@ -394,6 +394,7 @@ def _group_cues_by_layout(self, nodes, caption_set):
             return []
 
         current_layout = None
+        current_node = None
 
         # A list with layout groups. Since WebVTT only support positioning
         # for different cues, each layout group has to be represented in a
@@ -402,17 +403,24 @@ def _group_cues_by_layout(self, nodes, caption_set):
         # A properly encoded WebVTT string (plain unicode must be properly
         # escaped before being appended to this string)
         s = ''
+        row, column, prev_row, prev_column = 0, 0, 0, 0
         for i, node in enumerate(nodes):
             if node.type_ == CaptionNode.TEXT:
                 if s and current_layout and node.layout_info != current_layout:
                     # If the positioning changes from one text node to
                     # another, a new WebVTT cue has to be created.
-                    layout_groups.append((s, current_layout))
-                    s = ''
+                    row, column = node.position if node.position else (0, 0)
+                    prev_row, prev_column = current_node.position if current_node.position else (0, 0)
+                    if row == prev_row + 1:
+                        s += '\n'
+                    else:
+                        layout_groups.append((s, current_layout))
+                        s = ''
                 # ATTENTION: This is where the plain unicode node content is
                 # finally encoded as WebVTT.
                 s += self._encode_illegal_characters(node.content) or '&nbsp;'
                 current_layout = node.layout_info
+                current_node = node
             elif node.type_ == CaptionNode.STYLE:
                 resulting_style = self._calculate_resulting_style(
                     node.content, caption_set
diff --git a/setup.py b/setup.py
@@ -24,7 +24,7 @@
 
 setup(
     name='pycaption',
-    version='2.2.10',
+    version='2.2.11',
     description='Closed caption converter',
     long_description=open(README_PATH).read(),
     author='Joe Norton',
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -54,14 +54,14 @@
     scc_that_generates_webvtt_with_proper_newlines,
     sample_scc_produces_captions_with_start_and_end_time_the_same,
     sample_scc_pop_on, sample_scc_multiple_positioning, sample_scc_with_italics,
-    sample_scc_empty, sample_scc_roll_up_ru2, sample_no_positioning_at_all_scc,
+    sample_scc_empty, sample_scc_roll_up_ru2, sample_scc_roll_up_ru3,
+    sample_no_positioning_at_all_scc, sample_scc_with_line_too_long,
     sample_scc_no_explicit_end_to_last_caption, sample_scc_flashing_cue,
     sample_scc_eoc_first_command, sample_scc_with_extended_characters,
     sample_scc_with_ampersand_character, sample_scc_multiple_formats,
     sample_scc_duplicate_tab_offset, sample_scc_duplicate_special_characters,
     sample_scc_tab_offset, sample_scc_with_unknown_commands,
     sample_scc_special_and_extended_characters,
-    sample_scc_with_line_too_long
 )
 from tests.fixtures.srt import (  # noqa: F401
     sample_srt, sample_srt_ascii, sample_srt_numeric, sample_srt_empty,
diff --git a/tests/fixtures/scc.py b/tests/fixtures/scc.py
@@ -140,6 +140,46 @@ def sample_scc_roll_up_ru2():
 
 00:00:12;07    9425 9425 94ad 94ad 9470 9470 91b0 9131 9132 9132
 
+00:00:12;30    9425 94ad 94ad 9470 9470 91b0 9131 9132 9132
+
+00:00:13;07    9425 9425 94ad 94ad 9470 9470 c1c2 c3c4 c580 91bf
+
+00:00:14;07    9425 9425 94ad 94ad 9470 9470 9220 9220 92a1 92a2 92a7
+
+00:00:17;01    9426 9426 94ad 94ad 9470 9470 57c8 4552 4520 d94f d5a7 5245 20d3 54c1 cec4 49ce c720 ce4f 572c
+
+00:00:18;19    9426 9426 94ad 94ad 9470 9470 4c4f 4fcb 49ce c720 4fd5 5420 54c8 4552 452c 2054 c8c1 54a7 d320 c14c 4c
+
+00:00:20;06    9426 9426 94ad 94ad 9470 9470 54c8 4520 4352 4f57 c4ae
+
+00:00:21;24    9426 9426 94ad 94ad 9470 9470 3e3e 2049 5420 57c1 d320 c74f 4fc4 2054 4f20 c245 2049 ce20 54c8 45
+
+00:00:34;27    94a7 94ad 9470 c16e 6420 f2e5 73f4 eff2 e520 49ef f761 a773 20ec 616e 642c 20f7 61f4 e5f2
+
+00:00:36;12    94a7 94ad 9470 c16e 6420 f7e9 ec64 ece9 e6e5 ae80
+
+00:00:44;08    94a7 94ad 9470 3e3e 20c2 e96b e520 49ef f761 2c20 79ef 75f2 2073 ef75 f2e3 e520 e6ef f280
+"""
+
+
+@pytest.fixture(scope="session")
+def sample_scc_roll_up_ru3():
+    return """\
+Scenarist_SCC V1.0
+00:00:00;22    9425 9425 94ad 94ad 9470 9470 3e3e 3e20 c849 ae80
+
+00:00:02;23    9425 9425 94ad 94ad 9470 9470 49a7 cd20 cb45 d649 ce20 43d5 cece 49ce c720 c1ce c420 c154
+
+00:00:04;17    9425 9425 94ad 94ad 9470 9470 49ce d645 d354 4f52 a7d3 20c2 c1ce cb20 5745 20c2 454c 4945 d645 2049 ce80
+
+00:00:06;04    9425 9425 94ad 94ad 9470 9470 c845 4cd0 49ce c720 54c8 4520 4c4f 43c1 4c20 ce45 49c7 c8c2 4f52 c84f 4fc4 d380
+
+00:00:09;21    9425 9425 94ad 94ad 9470 9470 c1ce c420 49cd d052 4fd6 49ce c720 54c8 4520 4c49 d645 d320 4f46 20c1 4c4c
+
+00:00:11;07    9425 9425 94ad 94ad 9470 9470 5745 20d3 4552 d645 ae80
+
+00:00:12;07    9425 9425 94ad 94ad 9470 9470 91b0 9131 9132 9132
+
 00:00:13;07    9425 9425 94ad 94ad 9470 9470 c1c2 c3c4 c580 91bf
 
 00:00:14;07    9425 9425 94ad 94ad 9470 9470 9220 9220 92a1 92a2 92a7
diff --git a/tests/test_scc.py b/tests/test_scc.py
@@ -227,7 +227,6 @@ def test_skip_duplicate_special_characters(
             '®°½¿™¢£♪à èâêîôû',
             '®°AA½¿™¢£♪à èâêAAîôû'
         ]
-
         caption_set = SCCReader().read(sample_scc_duplicate_special_characters)
         actual_lines = [
             node.content
@@ -279,6 +278,7 @@ def test_freeze_rollup_captions_contents(self, sample_scc_roll_up_ru2):
             'AND IMPROVING THE LIVES OF ALL',
             'WE SERVE.',
             '®°½',
+            '®°½½',
             'ABû',
             'ÁÉÓ¡',
             "WHERE YOU'RE STANDING NOW,",
@@ -323,8 +323,8 @@ def test_multiple_formats(self, sample_scc_multiple_formats):
 
         assert expected_text_lines == text_lines
 
-    def test_freeze_semicolon_spec_time(self, sample_scc_roll_up_ru2):
-        scc1 = SCCReader().read(sample_scc_roll_up_ru2)
+    def test_freeze_semicolon_spec_time(self, sample_scc_roll_up_ru3):
+        scc1 = SCCReader().read(sample_scc_roll_up_ru3)
         captions = scc1.get_captions('en-US')
         expected_timings = [
             (733333.3333333333, 2766666.6666666665),
@@ -346,7 +346,6 @@ def test_freeze_semicolon_spec_time(self, sample_scc_roll_up_ru2):
         ]
 
         actual_timings = [(c_.start, c_.end) for c_ in captions]
-
         assert expected_timings == actual_timings
 
     def test_freeze_colon_spec_time(self, sample_scc_pop_on):

Original file line number	Diff line number	Diff line change
`@@ -1058,3 +1058,5 @@ def _restructure_bytes_to_position_map(byte_to_pos_map):`
`1058`	`1058`	`"10a7", "10a8", "1029", "102a", "10ab", "102c", "10ad",`
`1059`	`1059`	`"10ae", "102f", "97ad"`
`1060`	`1060`	`]`
	`1061`	`+`
	`1062`	`+CUE_STARTING_COMMAND = ['9425', '9426', '94a7', '9429', '9420']`